From 0c484ca0eaf1f6bac7f00700c249cfe3cb33bfaa Mon Sep 17 00:00:00 2001 From: Toni Lukkaroinen Date: Sun, 12 Jan 2025 10:12:21 +0200 Subject: [PATCH] Add SHA256 and SHA256ET10 algos. --- cpu-miner.c | 922 +- miner.h | 90 +- scrypt-arm.S | 5810 +++++++- scrypt-arm.S.orig | 1186 ++ scrypt-ppc.S | 3550 +++-- scrypt-ppc.S.orig | 1148 ++ scrypt-x64.S | 15906 ++++++++++++++++++---- scrypt-x64.S.orig | 2907 ++++ scrypt-x86.S | 2560 +++- scrypt-x86.S.orig | 830 ++ sha2-arm.S | 12470 +++++++++++++++-- sha2-arm.S.orig | 1583 +++ sha2-ppc.S | 16101 ++++++++++++++++++++-- sha2-ppc.S.orig | 2007 +++ sha2-x64.S | 31643 +++++++++++++++++++++++++++++++++++++++----- sha2-x64.S.orig | 4222 ++++++ sha2-x86.S | 5473 +++++++- sha2-x86.S.orig | 1193 ++ sha2.c | 451 +- 19 files changed, 99583 insertions(+), 10469 deletions(-) create mode 100644 scrypt-arm.S.orig create mode 100644 scrypt-ppc.S.orig create mode 100644 scrypt-x64.S.orig create mode 100644 scrypt-x86.S.orig create mode 100644 sha2-arm.S.orig create mode 100644 sha2-ppc.S.orig create mode 100644 sha2-x64.S.orig create mode 100644 sha2-x86.S.orig diff --git a/cpu-miner.c b/cpu-miner.c index ef2fc7672..586b81430 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -38,8 +38,8 @@ #include "compat.h" #include "miner.h" -#define PROGRAM_NAME "minerd" -#define LP_SCANTIME 60 +#define PROGRAM_NAME "minerd" +#define LP_SCANTIME 60 #ifdef __linux /* Linux specific policy and affinity management */ #include @@ -86,28 +86,36 @@ static inline void affine_to_cpu(int id, int cpu) { } #endif - -enum workio_commands { + +enum workio_commands +{ WC_GET_WORK, WC_SUBMIT_WORK, }; -struct workio_cmd { - enum workio_commands cmd; - struct thr_info *thr; - union { - struct work *work; +struct workio_cmd +{ + enum workio_commands cmd; + struct thr_info *thr; + union + { + struct work *work; } u; }; -enum algos { - ALGO_SCRYPT, /* scrypt(1024,1,1) */ - ALGO_SHA256D, /* SHA-256d */ +enum algos +{ + ALGO_SCRYPT, /* scrypt(1024,1,1) */ + ALGO_SHA256D, /* SHA-256d */ + ALGO_SHA256, /* SHA-256 */ + ALGO_SHA256ET10, /* SHA-256 */ }; static const char *algo_names[] = { - [ALGO_SCRYPT] = "scrypt", - [ALGO_SHA256D] = "sha256d", + [ALGO_SCRYPT] = "scrypt", + [ALGO_SHA256D] = "sha256d", + [ALGO_SHA256] = "sha256", + [ALGO_SHA256ET10] = "sha256ET10", }; bool opt_debug = false; @@ -157,7 +165,8 @@ static double *thr_hashrates; #ifdef HAVE_GETOPT_LONG #include #else -struct option { +struct option +{ const char *name; int has_arg; int *flag; @@ -169,9 +178,11 @@ static char const usage[] = "\ Usage: " PROGRAM_NAME " [OPTIONS]\n\ Options:\n\ -a, --algo=ALGO specify the algorithm to use\n\ - scrypt scrypt(1024, 1, 1) (default)\n\ - scrypt:N scrypt(N, 1, 1)\n\ - sha256d SHA-256d\n\ + scrypt scrypt(1024, 1, 1) (default)\n\ + scrypt:N scrypt(N, 1, 1)\n\ + sha256d SHA-256d\n\ + sha256 SHA-256\n\ + sha256ET10 SHA-256ET10\n\ -o, --url=URL URL of mining server\n\ -O, --userpass=U:P username:password pair for mining server\n\ -u, --user=USERNAME username for mining server\n\ @@ -196,14 +207,14 @@ Options:\n\ -D, --debug enable debug output\n\ -P, --protocol-dump verbose dump of protocol-level activities\n" #ifdef HAVE_SYSLOG_H -"\ + "\ -S, --syslog use system log for output messages\n" #endif #ifndef WIN32 -"\ + "\ -B, --background run the miner in the background\n" #endif -"\ + "\ --benchmark run in offline benchmark mode\n\ -c, --config=FILE load a JSON-format configuration file\n\ -V, --version display version information and exit\n\ @@ -212,50 +223,50 @@ Options:\n\ static char const short_options[] = 
#ifndef WIN32 - "B" + "B" #endif #ifdef HAVE_SYSLOG_H - "S" + "S" #endif - "a:c:Dhp:Px:qr:R:s:t:T:o:u:O:V"; + "a:c:Dhp:Px:qr:R:s:t:T:o:u:O:V"; static struct option const options[] = { - { "algo", 1, NULL, 'a' }, + {"algo", 1, NULL, 'a'}, #ifndef WIN32 - { "background", 0, NULL, 'B' }, + {"background", 0, NULL, 'B'}, #endif - { "benchmark", 0, NULL, 1005 }, - { "cert", 1, NULL, 1001 }, - { "coinbase-addr", 1, NULL, 1013 }, - { "coinbase-sig", 1, NULL, 1015 }, - { "config", 1, NULL, 'c' }, - { "debug", 0, NULL, 'D' }, - { "help", 0, NULL, 'h' }, - { "no-gbt", 0, NULL, 1011 }, - { "no-getwork", 0, NULL, 1010 }, - { "no-longpoll", 0, NULL, 1003 }, - { "no-redirect", 0, NULL, 1009 }, - { "no-stratum", 0, NULL, 1007 }, - { "pass", 1, NULL, 'p' }, - { "protocol-dump", 0, NULL, 'P' }, - { "proxy", 1, NULL, 'x' }, - { "quiet", 0, NULL, 'q' }, - { "retries", 1, NULL, 'r' }, - { "retry-pause", 1, NULL, 'R' }, - { "scantime", 1, NULL, 's' }, + {"benchmark", 0, NULL, 1005}, + {"cert", 1, NULL, 1001}, + {"coinbase-addr", 1, NULL, 1013}, + {"coinbase-sig", 1, NULL, 1015}, + {"config", 1, NULL, 'c'}, + {"debug", 0, NULL, 'D'}, + {"help", 0, NULL, 'h'}, + {"no-gbt", 0, NULL, 1011}, + {"no-getwork", 0, NULL, 1010}, + {"no-longpoll", 0, NULL, 1003}, + {"no-redirect", 0, NULL, 1009}, + {"no-stratum", 0, NULL, 1007}, + {"pass", 1, NULL, 'p'}, + {"protocol-dump", 0, NULL, 'P'}, + {"proxy", 1, NULL, 'x'}, + {"quiet", 0, NULL, 'q'}, + {"retries", 1, NULL, 'r'}, + {"retry-pause", 1, NULL, 'R'}, + {"scantime", 1, NULL, 's'}, #ifdef HAVE_SYSLOG_H - { "syslog", 0, NULL, 'S' }, + {"syslog", 0, NULL, 'S'}, #endif - { "threads", 1, NULL, 't' }, - { "timeout", 1, NULL, 'T' }, - { "url", 1, NULL, 'o' }, - { "user", 1, NULL, 'u' }, - { "userpass", 1, NULL, 'O' }, - { "version", 0, NULL, 'V' }, - { 0, 0, 0, 0 } -}; - -struct work { + {"threads", 1, NULL, 't'}, + {"timeout", 1, NULL, 'T'}, + {"url", 1, NULL, 'o'}, + {"user", 1, NULL, 'u'}, + {"userpass", 1, NULL, 'O'}, + {"version", 0, NULL, 'V'}, + {0, 0, 0, 0}}; + +struct work +{ uint32_t data[32]; uint32_t target[8]; @@ -291,25 +302,28 @@ static inline void work_copy(struct work *dest, const struct work *src) dest->workid = strdup(src->workid); if (src->job_id) dest->job_id = strdup(src->job_id); - if (src->xnonce2) { + if (src->xnonce2) + { dest->xnonce2 = malloc(src->xnonce2_len); memcpy(dest->xnonce2, src->xnonce2, src->xnonce2_len); } } static bool jobj_binary(const json_t *obj, const char *key, - void *buf, size_t buflen) + void *buf, size_t buflen) { const char *hexstr; json_t *tmp; tmp = json_object_get(obj, key); - if (unlikely(!tmp)) { + if (unlikely(!tmp)) + { applog(LOG_ERR, "JSON key '%s' not found", key); return false; } hexstr = json_string_value(tmp); - if (unlikely(!hexstr)) { + if (unlikely(!hexstr)) + { applog(LOG_ERR, "JSON key '%s' is not a string", key); return false; } @@ -323,11 +337,13 @@ static bool work_decode(const json_t *val, struct work *work) { int i; - if (unlikely(!jobj_binary(val, "data", work->data, sizeof(work->data)))) { + if (unlikely(!jobj_binary(val, "data", work->data, sizeof(work->data)))) + { applog(LOG_ERR, "JSON invalid data"); goto err_out; } - if (unlikely(!jobj_binary(val, "target", work->target, sizeof(work->target)))) { + if (unlikely(!jobj_binary(val, "target", work->target, sizeof(work->target)))) + { applog(LOG_ERR, "JSON invalid target"); goto err_out; } @@ -354,7 +370,7 @@ static bool gbt_work_decode(const json_t *val, struct work *work) unsigned char *tx = NULL; int tx_count, tx_size; unsigned char txc_vi[9]; - 
unsigned char (*merkle_tree)[32] = NULL; + unsigned char(*merkle_tree)[32] = NULL; bool coinbase_append = false; bool submit_coinbase = false; bool segwit = false; @@ -362,9 +378,11 @@ static bool gbt_work_decode(const json_t *val, struct work *work) bool rc = false; tmp = json_object_get(val, "rules"); - if (tmp && json_is_array(tmp)) { + if (tmp && json_is_array(tmp)) + { n = json_array_size(tmp); - for (i = 0; i < n; i++) { + for (i = 0; i < n; i++) + { const char *s = json_string_value(json_array_get(tmp, i)); if (!s) continue; @@ -374,9 +392,11 @@ static bool gbt_work_decode(const json_t *val, struct work *work) } tmp = json_object_get(val, "mutable"); - if (tmp && json_is_array(tmp)) { + if (tmp && json_is_array(tmp)) + { n = json_array_size(tmp); - for (i = 0; i < n; i++) { + for (i = 0; i < n; i++) + { const char *s = json_string_value(json_array_get(tmp, i)); if (!s) continue; @@ -388,48 +408,56 @@ static bool gbt_work_decode(const json_t *val, struct work *work) } tmp = json_object_get(val, "height"); - if (!tmp || !json_is_integer(tmp)) { + if (!tmp || !json_is_integer(tmp)) + { applog(LOG_ERR, "JSON invalid height"); goto out; } work->height = json_integer_value(tmp); tmp = json_object_get(val, "version"); - if (!tmp || !json_is_integer(tmp)) { + if (!tmp || !json_is_integer(tmp)) + { applog(LOG_ERR, "JSON invalid version"); goto out; } version = json_integer_value(tmp); - if (unlikely(!jobj_binary(val, "previousblockhash", prevhash, sizeof(prevhash)))) { + if (unlikely(!jobj_binary(val, "previousblockhash", prevhash, sizeof(prevhash)))) + { applog(LOG_ERR, "JSON invalid previousblockhash"); goto out; } tmp = json_object_get(val, "curtime"); - if (!tmp || !json_is_integer(tmp)) { + if (!tmp || !json_is_integer(tmp)) + { applog(LOG_ERR, "JSON invalid curtime"); goto out; } curtime = json_integer_value(tmp); - if (unlikely(!jobj_binary(val, "bits", &bits, sizeof(bits)))) { + if (unlikely(!jobj_binary(val, "bits", &bits, sizeof(bits)))) + { applog(LOG_ERR, "JSON invalid bits"); goto out; } /* find count and size of transactions */ txa = json_object_get(val, "transactions"); - if (!txa || !json_is_array(txa)) { + if (!txa || !json_is_array(txa)) + { applog(LOG_ERR, "JSON invalid transactions"); goto out; } tx_count = json_array_size(txa); tx_size = 0; - for (i = 0; i < tx_count; i++) { + for (i = 0; i < tx_count; i++) + { const json_t *tx = json_array_get(txa, i); const char *tx_hex = json_string_value(json_object_get(tx, "data")); - if (!tx_hex) { + if (!tx_hex) + { applog(LOG_ERR, "JSON invalid transactions"); goto out; } @@ -438,131 +466,157 @@ static bool gbt_work_decode(const json_t *val, struct work *work) /* build coinbase transaction */ tmp = json_object_get(val, "coinbasetxn"); - if (tmp) { + if (tmp) + { const char *cbtx_hex = json_string_value(json_object_get(tmp, "data")); cbtx_size = cbtx_hex ? 
strlen(cbtx_hex) / 2 : 0; cbtx = malloc(cbtx_size + 100); - if (cbtx_size < 60 || !hex2bin(cbtx, cbtx_hex, cbtx_size)) { + if (cbtx_size < 60 || !hex2bin(cbtx, cbtx_hex, cbtx_size)) + { applog(LOG_ERR, "JSON invalid coinbasetxn"); goto out; } - } else { + } + else + { int64_t cbvalue; - if (!pk_script_size) { - if (allow_getwork) { + if (!pk_script_size) + { + if (allow_getwork) + { applog(LOG_INFO, "No payout address provided, switching to getwork"); have_gbt = false; - } else + } + else applog(LOG_ERR, "No payout address provided"); goto out; } tmp = json_object_get(val, "coinbasevalue"); - if (!tmp || !json_is_number(tmp)) { + if (!tmp || !json_is_number(tmp)) + { applog(LOG_ERR, "JSON invalid coinbasevalue"); goto out; } cbvalue = json_is_integer(tmp) ? json_integer_value(tmp) : json_number_value(tmp); cbtx = malloc(256); - le32enc((uint32_t *)cbtx, 1); /* version */ - cbtx[4] = 1; /* in-counter */ - memset(cbtx+5, 0x00, 32); /* prev txout hash */ - le32enc((uint32_t *)(cbtx+37), 0xffffffff); /* prev txout index */ + le32enc((uint32_t *)cbtx, 1); /* version */ + cbtx[4] = 1; /* in-counter */ + memset(cbtx + 5, 0x00, 32); /* prev txout hash */ + le32enc((uint32_t *)(cbtx + 37), 0xffffffff); /* prev txout index */ cbtx_size = 43; /* BIP 34: height in coinbase */ - if (work->height >= 1 && work->height <= 16) { + if (work->height >= 1 && work->height <= 16) + { /* Use OP_1-OP_16 to conform to Bitcoin's implementation. */ cbtx[42] = work->height + 0x50; cbtx[cbtx_size++] = 0x00; /* OP_0; pads to 2 bytes */ - } else { - for (n = work->height; n; n >>= 8) { + } + else + { + for (n = work->height; n; n >>= 8) + { cbtx[cbtx_size++] = n & 0xff; if (n < 0x100 && n >= 0x80) cbtx[cbtx_size++] = 0; } cbtx[42] = cbtx_size - 43; } - cbtx[41] = cbtx_size - 42; /* scriptsig length */ - le32enc((uint32_t *)(cbtx+cbtx_size), 0xffffffff); /* sequence */ + cbtx[41] = cbtx_size - 42; /* scriptsig length */ + le32enc((uint32_t *)(cbtx + cbtx_size), 0xffffffff); /* sequence */ cbtx_size += 4; - cbtx[cbtx_size++] = segwit ? 2 : 1; /* out-counter */ - le32enc((uint32_t *)(cbtx+cbtx_size), (uint32_t)cbvalue); /* value */ - le32enc((uint32_t *)(cbtx+cbtx_size+4), cbvalue >> 32); + cbtx[cbtx_size++] = segwit ? 
2 : 1; /* out-counter */ + le32enc((uint32_t *)(cbtx + cbtx_size), (uint32_t)cbvalue); /* value */ + le32enc((uint32_t *)(cbtx + cbtx_size + 4), cbvalue >> 32); cbtx_size += 8; cbtx[cbtx_size++] = pk_script_size; /* txout-script length */ - memcpy(cbtx+cbtx_size, pk_script, pk_script_size); + memcpy(cbtx + cbtx_size, pk_script, pk_script_size); cbtx_size += pk_script_size; - if (segwit) { - unsigned char (*wtree)[32] = calloc(tx_count + 2, 32); - memset(cbtx+cbtx_size, 0, 8); /* value */ + if (segwit) + { + unsigned char(*wtree)[32] = calloc(tx_count + 2, 32); + memset(cbtx + cbtx_size, 0, 8); /* value */ cbtx_size += 8; - cbtx[cbtx_size++] = 38; /* txout-script length */ + cbtx[cbtx_size++] = 38; /* txout-script length */ cbtx[cbtx_size++] = 0x6a; /* txout-script */ cbtx[cbtx_size++] = 0x24; cbtx[cbtx_size++] = 0xaa; cbtx[cbtx_size++] = 0x21; cbtx[cbtx_size++] = 0xa9; cbtx[cbtx_size++] = 0xed; - for (i = 0; i < tx_count; i++) { + for (i = 0; i < tx_count; i++) + { const json_t *tx = json_array_get(txa, i); const json_t *hash = json_object_get(tx, "hash"); - if (!hash || !hex2bin(wtree[1+i], json_string_value(hash), 32)) { + if (!hash || !hex2bin(wtree[1 + i], json_string_value(hash), 32)) + { applog(LOG_ERR, "JSON invalid transaction hash"); free(wtree); goto out; } - memrev(wtree[1+i], 32); + memrev(wtree[1 + i], 32); } n = tx_count + 1; - while (n > 1) { + while (n > 1) + { if (n % 2) - memcpy(wtree[n], wtree[n-1], 32); + memcpy(wtree[n], wtree[n - 1], 32); n = (n + 1) / 2; for (i = 0; i < n; i++) - sha256d(wtree[i], wtree[2*i], 64); + sha256d(wtree[i], wtree[2 * i], 64); } - memset(wtree[1], 0, 32); /* witness reserved value = 0 */ - sha256d(cbtx+cbtx_size, wtree[0], 64); + memset(wtree[1], 0, 32); /* witness reserved value = 0 */ + sha256d(cbtx + cbtx_size, wtree[0], 64); cbtx_size += 32; free(wtree); } - le32enc((uint32_t *)(cbtx+cbtx_size), 0); /* lock time */ + le32enc((uint32_t *)(cbtx + cbtx_size), 0); /* lock time */ cbtx_size += 4; coinbase_append = true; } - if (coinbase_append) { + if (coinbase_append) + { unsigned char xsig[100]; int xsig_len = 0; - if (*coinbase_sig) { + if (*coinbase_sig) + { n = strlen(coinbase_sig); - if (cbtx[41] + xsig_len + n <= 100) { - memcpy(xsig+xsig_len, coinbase_sig, n); + if (cbtx[41] + xsig_len + n <= 100) + { + memcpy(xsig + xsig_len, coinbase_sig, n); xsig_len += n; - } else { + } + else + { applog(LOG_WARNING, "Signature does not fit in coinbase, skipping"); } } tmp = json_object_get(val, "coinbaseaux"); - if (tmp && json_is_object(tmp)) { + if (tmp && json_is_object(tmp)) + { void *iter = json_object_iter(tmp); - while (iter) { + while (iter) + { unsigned char buf[100]; const char *s = json_string_value(json_object_iter_value(iter)); n = s ? strlen(s) / 2 : 0; - if (!s || n > 100 || !hex2bin(buf, s, n)) { + if (!s || n > 100 || !hex2bin(buf, s, n)) + { applog(LOG_ERR, "JSON invalid coinbaseaux"); break; } - if (cbtx[41] + xsig_len + n <= 100) { - memcpy(xsig+xsig_len, buf, n); + if (cbtx[41] + xsig_len + n <= 100) + { + memcpy(xsig + xsig_len, buf, n); xsig_len += n; } iter = json_object_iter_next(tmp, iter); } } - if (xsig_len) { + if (xsig_len) + { unsigned char *ssig_end = cbtx + 42 + cbtx[41]; - int push_len = cbtx[41] + xsig_len < 76 ? 1 : - cbtx[41] + 2 + xsig_len > 100 ? 0 : 2; + int push_len = cbtx[41] + xsig_len < 76 ? 1 : cbtx[41] + 2 + xsig_len > 100 ? 
0 + : 2; n = xsig_len + push_len; memmove(ssig_end + n, ssig_end, cbtx_size - 42 - cbtx[41]); cbtx[41] += n; @@ -578,7 +632,7 @@ static bool gbt_work_decode(const json_t *val, struct work *work) n = varint_encode(txc_vi, 1 + tx_count); work->txs = malloc(2 * (n + cbtx_size + tx_size) + 1); bin2hex(work->txs, txc_vi, n); - bin2hex(work->txs + 2*n, cbtx, cbtx_size); + bin2hex(work->txs + 2 * n, cbtx, cbtx_size); char *txs_end = work->txs + strlen(work->txs); /* generate merkle root */ @@ -586,45 +640,56 @@ static bool gbt_work_decode(const json_t *val, struct work *work) size_t tx_buf_size = 32 * 1024; tx = malloc(tx_buf_size); sha256d(merkle_tree[0], cbtx, cbtx_size); - for (i = 0; i < tx_count; i++) { + for (i = 0; i < tx_count; i++) + { tmp = json_array_get(txa, i); const char *tx_hex = json_string_value(json_object_get(tmp, "data")); const size_t tx_hex_len = tx_hex ? strlen(tx_hex) : 0; const int tx_size = tx_hex_len / 2; - if (segwit) { + if (segwit) + { const char *txid = json_string_value(json_object_get(tmp, "txid")); - if (!txid || !hex2bin(merkle_tree[1 + i], txid, 32)) { + if (!txid || !hex2bin(merkle_tree[1 + i], txid, 32)) + { applog(LOG_ERR, "JSON invalid transaction txid"); goto out; } memrev(merkle_tree[1 + i], 32); - } else { - if (tx_size > tx_buf_size) { + } + else + { + if (tx_size > tx_buf_size) + { free(tx); tx_buf_size = tx_size * 2; tx = malloc(tx_buf_size); } - if (!tx_hex || !hex2bin(tx, tx_hex, tx_size)) { + if (!tx_hex || !hex2bin(tx, tx_hex, tx_size)) + { applog(LOG_ERR, "JSON invalid transactions"); goto out; } sha256d(merkle_tree[1 + i], tx, tx_size); } - if (!submit_coinbase) { + if (!submit_coinbase) + { strcpy(txs_end, tx_hex); txs_end += tx_hex_len; } } - free(tx); tx = NULL; + free(tx); + tx = NULL; n = 1 + tx_count; - while (n > 1) { - if (n % 2) { - memcpy(merkle_tree[n], merkle_tree[n-1], 32); + while (n > 1) + { + if (n % 2) + { + memcpy(merkle_tree[n], merkle_tree[n - 1], 32); ++n; } n /= 2; for (i = 0; i < n; i++) - sha256d(merkle_tree[i], merkle_tree[2*i], 64); + sha256d(merkle_tree[i], merkle_tree[2 * i], 64); } /* assemble block header */ @@ -639,7 +704,8 @@ static bool gbt_work_decode(const json_t *val, struct work *work) work->data[20] = 0x80000000; work->data[31] = 0x00000280; - if (unlikely(!jobj_binary(val, "target", target, sizeof(target)))) { + if (unlikely(!jobj_binary(val, "target", target, sizeof(target)))) + { applog(LOG_ERR, "JSON invalid target"); goto out; } @@ -647,8 +713,10 @@ static bool gbt_work_decode(const json_t *val, struct work *work) work->target[7 - i] = be32dec(target + i); tmp = json_object_get(val, "workid"); - if (tmp) { - if (!json_is_string(tmp)) { + if (tmp) + { + if (!json_is_string(tmp)) + { applog(LOG_ERR, "JSON invalid workid"); goto out; } @@ -657,10 +725,12 @@ static bool gbt_work_decode(const json_t *val, struct work *work) /* Long polling */ tmp = json_object_get(val, "longpollid"); - if (want_longpoll && json_is_string(tmp)) { + if (want_longpoll && json_is_string(tmp)) + { free(lp_id); lp_id = strdup(json_string_value(tmp)); - if (!have_longpoll) { + if (!have_longpoll) + { char *lp_uri; tmp = json_object_get(val, "longpolluri"); lp_uri = strdup(json_is_string(tmp) ? json_string_value(tmp) : rpc_url); @@ -690,14 +760,14 @@ static void share_result(int result, const char *reason) hashrate += thr_hashrates[i]; result ? accepted_count++ : rejected_count++; pthread_mutex_unlock(&stats_lock); - + sprintf(s, hashrate >= 1e6 ? 
"%.0f" : "%.2f", 1e-3 * hashrate); applog(LOG_INFO, "accepted: %lu/%lu (%.2f%%), %s khash/s %s", - accepted_count, - accepted_count + rejected_count, - 100. * accepted_count / (accepted_count + rejected_count), - s, - result ? "(yay!!!)" : "(booooo)"); + accepted_count, + accepted_count + rejected_count, + 100. * accepted_count / (accepted_count + rejected_count), + s, + result ? "(yay!!!)" : "(booooo)"); if (opt_debug && reason) applog(LOG_DEBUG, "DEBUG: reject reason: %s", reason); @@ -712,13 +782,15 @@ static bool submit_upstream_work(CURL *curl, struct work *work) bool rc = false; /* pass if the previous hash is not the current previous hash */ - if (!submit_old && memcmp(work->data + 1, g_work.data + 1, 32)) { + if (!submit_old && memcmp(work->data + 1, g_work.data + 1, 32)) + { if (opt_debug) applog(LOG_DEBUG, "DEBUG: stale work detected, discarding"); return true; } - if (have_stratum) { + if (have_stratum) + { uint32_t ntime, nonce; char ntimestr[9], noncestr[9], *xnonce2str, *req; @@ -729,53 +801,63 @@ static bool submit_upstream_work(CURL *curl, struct work *work) xnonce2str = abin2hex(work->xnonce2, work->xnonce2_len); req = malloc(256 + strlen(rpc_user) + strlen(work->job_id) + 2 * work->xnonce2_len); sprintf(req, - "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", - rpc_user, work->job_id, xnonce2str, ntimestr, noncestr); + "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", + rpc_user, work->job_id, xnonce2str, ntimestr, noncestr); free(xnonce2str); rc = stratum_send_line(&stratum, req); free(req); - if (unlikely(!rc)) { + if (unlikely(!rc)) + { applog(LOG_ERR, "submit_upstream_work stratum_send_line failed"); goto out; } - } else if (work->txs) { + } + else if (work->txs) + { char *req; for (i = 0; i < ARRAY_SIZE(work->data); i++) be32enc(work->data + i, work->data[i]); bin2hex(data_str, (unsigned char *)work->data, 80); - if (work->workid) { + if (work->workid) + { char *params; val = json_object(); json_object_set_new(val, "workid", json_string(work->workid)); params = json_dumps(val, 0); json_decref(val); - req = malloc(128 + 2*80 + strlen(work->txs) + strlen(params)); + req = malloc(128 + 2 * 80 + strlen(work->txs) + strlen(params)); sprintf(req, - "{\"method\": \"submitblock\", \"params\": [\"%s%s\", %s], \"id\":1}\r\n", - data_str, work->txs, params); + "{\"method\": \"submitblock\", \"params\": [\"%s%s\", %s], \"id\":1}\r\n", + data_str, work->txs, params); free(params); - } else { - req = malloc(128 + 2*80 + strlen(work->txs)); + } + else + { + req = malloc(128 + 2 * 80 + strlen(work->txs)); sprintf(req, - "{\"method\": \"submitblock\", \"params\": [\"%s%s\"], \"id\":1}\r\n", - data_str, work->txs); + "{\"method\": \"submitblock\", \"params\": [\"%s%s\"], \"id\":1}\r\n", + data_str, work->txs); } val = json_rpc_call(curl, rpc_url, rpc_userpass, req, NULL, 0); free(req); - if (unlikely(!val)) { + if (unlikely(!val)) + { applog(LOG_ERR, "submit_upstream_work json_rpc_call failed"); goto out; } res = json_object_get(val, "result"); - if (json_is_object(res)) { + if (json_is_object(res)) + { char *res_str; bool sumres = false; void *iter = json_object_iter(res); - while (iter) { - if (json_is_null(json_object_iter_value(iter))) { + while (iter) + { + if (json_is_null(json_object_iter_value(iter))) + { sumres = true; break; } @@ -784,11 +866,14 @@ static bool submit_upstream_work(CURL *curl, struct work *work) res_str = json_dumps(res, 0); share_result(sumres, res_str); 
free(res_str); - } else + } + else share_result(json_is_null(res), json_string_value(res)); json_decref(val); - } else { + } + else + { /* build hex string */ for (i = 0; i < ARRAY_SIZE(work->data); i++) le32enc(work->data + i, work->data[i]); @@ -796,12 +881,13 @@ static bool submit_upstream_work(CURL *curl, struct work *work) /* build JSON-RPC request */ sprintf(s, - "{\"method\": \"getwork\", \"params\": [ \"%s\" ], \"id\":1}\r\n", - data_str); + "{\"method\": \"getwork\", \"params\": [ \"%s\" ], \"id\":1}\r\n", + data_str); /* issue JSON-RPC request */ val = json_rpc_call(curl, rpc_url, rpc_userpass, s, NULL, 0); - if (unlikely(!val)) { + if (unlikely(!val)) + { applog(LOG_ERR, "submit_upstream_work json_rpc_call failed"); goto out; } @@ -820,17 +906,15 @@ static bool submit_upstream_work(CURL *curl, struct work *work) } static const char *getwork_req = - "{\"method\": \"getwork\", \"params\": [], \"id\":0}\r\n"; + "{\"method\": \"getwork\", \"params\": [], \"id\":0}\r\n"; #define GBT_CAPABILITIES "[\"coinbasetxn\", \"coinbasevalue\", \"longpoll\", \"workid\"]" #define GBT_RULES "[\"segwit\"]" static const char *gbt_req = - "{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": " - GBT_CAPABILITIES ", \"rules\": " GBT_RULES "}], \"id\":0}\r\n"; + "{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": " GBT_CAPABILITIES ", \"rules\": " GBT_RULES "}], \"id\":0}\r\n"; static const char *gbt_lp_req = - "{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": " - GBT_CAPABILITIES ", \"rules\": " GBT_RULES ", \"longpollid\": \"%s\"}], \"id\":0}\r\n"; + "{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": " GBT_CAPABILITIES ", \"rules\": " GBT_RULES ", \"longpollid\": \"%s\"}], \"id\":0}\r\n"; static bool get_upstream_work(CURL *curl, struct work *work) { @@ -842,24 +926,27 @@ static bool get_upstream_work(CURL *curl, struct work *work) start: gettimeofday(&tv_start, NULL); val = json_rpc_call(curl, rpc_url, rpc_userpass, - have_gbt ? gbt_req : getwork_req, - &err, have_gbt ? JSON_RPC_QUIET_404 : 0); + have_gbt ? gbt_req : getwork_req, + &err, have_gbt ? 
JSON_RPC_QUIET_404 : 0); gettimeofday(&tv_end, NULL); - if (have_stratum) { + if (have_stratum) + { if (val) json_decref(val); return true; } - if (!have_gbt && !allow_getwork) { + if (!have_gbt && !allow_getwork) + { applog(LOG_ERR, "No usable protocol"); if (val) json_decref(val); return false; } - if (have_gbt && allow_getwork && !val && err == CURLE_OK) { + if (have_gbt && allow_getwork && !val && err == CURLE_OK) + { applog(LOG_INFO, "getblocktemplate failed, falling back to getwork"); have_gbt = false; goto start; @@ -868,19 +955,23 @@ static bool get_upstream_work(CURL *curl, struct work *work) if (!val) return false; - if (have_gbt) { + if (have_gbt) + { rc = gbt_work_decode(json_object_get(val, "result"), work); - if (!have_gbt) { + if (!have_gbt) + { json_decref(val); goto start; } - } else + } + else rc = work_decode(json_object_get(val, "result"), work); - if (opt_debug && rc) { + if (opt_debug && rc) + { timeval_subtract(&diff, &tv_end, &tv_start); applog(LOG_DEBUG, "DEBUG: got new work in %d ms", - diff.tv_sec * 1000 + diff.tv_usec / 1000); + diff.tv_sec * 1000 + diff.tv_usec / 1000); } json_decref(val); @@ -893,7 +984,8 @@ static void workio_cmd_free(struct workio_cmd *wc) if (!wc) return; - switch (wc->cmd) { + switch (wc->cmd) + { case WC_SUBMIT_WORK: work_free(wc->u.work); free(wc->u.work); @@ -902,7 +994,7 @@ static void workio_cmd_free(struct workio_cmd *wc) break; } - memset(wc, 0, sizeof(*wc)); /* poison */ + memset(wc, 0, sizeof(*wc)); /* poison */ free(wc); } @@ -916,8 +1008,10 @@ static bool workio_get_work(struct workio_cmd *wc, CURL *curl) return false; /* obtain new work from bitcoin via JSON-RPC */ - while (!get_upstream_work(curl, ret_work)) { - if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) { + while (!get_upstream_work(curl, ret_work)) + { + if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) + { applog(LOG_ERR, "json_rpc_call failed, terminating workio thread"); free(ret_work); return false; @@ -925,7 +1019,7 @@ static bool workio_get_work(struct workio_cmd *wc, CURL *curl) /* pause, then restart work-request loop */ applog(LOG_ERR, "json_rpc_call failed, retry after %d seconds", - opt_fail_pause); + opt_fail_pause); sleep(opt_fail_pause); } @@ -941,15 +1035,17 @@ static bool workio_submit_work(struct workio_cmd *wc, CURL *curl) int failures = 0; /* submit solution to bitcoin via JSON-RPC */ - while (!submit_upstream_work(curl, wc->u.work)) { - if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) { + while (!submit_upstream_work(curl, wc->u.work)) + { + if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) + { applog(LOG_ERR, "...terminating workio thread"); return false; } /* pause, then restart work-request loop */ applog(LOG_ERR, "...retry after %d seconds", - opt_fail_pause); + opt_fail_pause); sleep(opt_fail_pause); } @@ -963,23 +1059,27 @@ static void *workio_thread(void *userdata) bool ok = true; curl = curl_easy_init(); - if (unlikely(!curl)) { + if (unlikely(!curl)) + { applog(LOG_ERR, "CURL initialization failed"); return NULL; } - while (ok) { + while (ok) + { struct workio_cmd *wc; /* wait for workio_cmd sent to us, on our queue */ wc = tq_pop(mythr->q, NULL); - if (!wc) { + if (!wc) + { ok = false; break; } /* process workio_cmd */ - switch (wc->cmd) { + switch (wc->cmd) + { case WC_GET_WORK: ok = workio_get_work(wc, curl); break; @@ -987,7 +1087,7 @@ static void *workio_thread(void *userdata) ok = workio_submit_work(wc, curl); break; - default: /* should never happen */ + default: /* should 
never happen */ ok = false; break; } @@ -1006,7 +1106,8 @@ static bool get_work(struct thr_info *thr, struct work *work) struct workio_cmd *wc; struct work *work_heap; - if (opt_benchmark) { + if (opt_benchmark) + { memset(work->data, 0x55, 76); work->data[17] = swab32(time(NULL)); memset(work->data + 19, 0x00, 52); @@ -1025,7 +1126,8 @@ static bool get_work(struct thr_info *thr, struct work *work) wc->thr = thr; /* send work request to workio thread */ - if (!tq_push(thr_info[work_thr_id].q, wc)) { + if (!tq_push(thr_info[work_thr_id].q, wc)) + { workio_cmd_free(wc); return false; } @@ -1045,7 +1147,7 @@ static bool get_work(struct thr_info *thr, struct work *work) static bool submit_work(struct thr_info *thr, const struct work *work_in) { struct workio_cmd *wc; - + /* fill out work request message */ wc = calloc(1, sizeof(*wc)); if (!wc) @@ -1085,13 +1187,15 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) /* Generate merkle root */ sha256d(merkle_root, sctx->job.coinbase, sctx->job.coinbase_size); - for (i = 0; i < sctx->job.merkle_count; i++) { + for (i = 0; i < sctx->job.merkle_count; i++) + { memcpy(merkle_root + 32, sctx->job.merkle[i], 32); sha256d(merkle_root, merkle_root, 64); } - + /* Increment extranonce2 */ - for (i = 0; i < sctx->xnonce2_size && !++sctx->job.xnonce2[i]; i++); + for (i = 0; i < sctx->xnonce2_size && !++sctx->job.xnonce2[i]; i++) + ; /* Assemble block header */ memset(work->data, 0, 128); @@ -1107,10 +1211,11 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) pthread_mutex_unlock(&sctx->work_lock); - if (opt_debug) { + if (opt_debug) + { char *xnonce2str = abin2hex(work->xnonce2, work->xnonce2_len); applog(LOG_DEBUG, "DEBUG: job_id='%s' extranonce2=%s ntime=%08x", - work->job_id, xnonce2str, swab32(work->data[17])); + work->job_id, xnonce2str, swab32(work->data[17])); free(xnonce2str); } @@ -1134,80 +1239,95 @@ static void *miner_thread(void *userdata) /* Set worker threads to nice 19 and then preferentially to SCHED_IDLE * and if that fails, then SCHED_BATCH. No need for this to be an * error if it fails */ - if (!opt_benchmark) { + if (!opt_benchmark) + { setpriority(PRIO_PROCESS, 0, 19); drop_policy(); } /* Cpu affinity only makes sense if the number of threads is a multiple * of the number of CPUs */ - if (num_processors > 1 && opt_n_threads % num_processors == 0) { + if (num_processors > 1 && opt_n_threads % num_processors == 0) + { if (!opt_quiet) applog(LOG_INFO, "Binding thread %d to cpu %d", - thr_id, thr_id % num_processors); + thr_id, thr_id % num_processors); affine_to_cpu(thr_id, thr_id % num_processors); } - - if (opt_algo == ALGO_SCRYPT) { + + if (opt_algo == ALGO_SCRYPT) + { scratchbuf = scrypt_buffer_alloc(opt_scrypt_n); - if (!scratchbuf) { + if (!scratchbuf) + { applog(LOG_ERR, "scrypt buffer allocation failed"); pthread_mutex_lock(&applog_lock); exit(1); } } - while (1) { + while (1) + { unsigned long hashes_done; struct timeval tv_start, tv_end, diff; int64_t max64; int rc; - if (have_stratum) { + if (have_stratum) + { while (time(NULL) >= g_work_time + 120) sleep(1); pthread_mutex_lock(&g_work_lock); if (work.data[19] >= end_nonce && !memcmp(work.data, g_work.data, 76)) stratum_gen_work(&stratum, &g_work); - } else { + } + else + { int min_scantime = have_longpoll ? 
LP_SCANTIME : opt_scantime; /* obtain new work from internal workio thread */ pthread_mutex_lock(&g_work_lock); if (!have_stratum && - (time(NULL) - g_work_time >= min_scantime || - work.data[19] >= end_nonce)) { + (time(NULL) - g_work_time >= min_scantime || + work.data[19] >= end_nonce)) + { work_free(&g_work); - if (unlikely(!get_work(mythr, &g_work))) { + if (unlikely(!get_work(mythr, &g_work))) + { applog(LOG_ERR, "work retrieval failed, exiting " - "mining thread %d", mythr->id); + "mining thread %d", + mythr->id); pthread_mutex_unlock(&g_work_lock); goto out; } g_work_time = have_stratum ? 0 : time(NULL); } - if (have_stratum) { + if (have_stratum) + { pthread_mutex_unlock(&g_work_lock); continue; } } - if (memcmp(work.data, g_work.data, 76)) { + if (memcmp(work.data, g_work.data, 76)) + { work_free(&work); work_copy(&work, &g_work); work.data[19] = 0xffffffffU / opt_n_threads * thr_id; - } else + } + else work.data[19]++; pthread_mutex_unlock(&g_work_lock); work_restart[thr_id].restart = 0; - + /* adjust max_nonce to meet target scan time */ if (have_stratum) max64 = LP_SCANTIME; else - max64 = g_work_time + (have_longpoll ? LP_SCANTIME : opt_scantime) - - time(NULL); + max64 = g_work_time + (have_longpoll ? LP_SCANTIME : opt_scantime) - time(NULL); max64 *= thr_hashrates[thr_id]; - if (max64 <= 0) { - switch (opt_algo) { + if (max64 <= 0) + { + switch (opt_algo) + { case ALGO_SCRYPT: max64 = opt_scrypt_n < 16 ? 0x3ffff : 0x3fffff / opt_scrypt_n; break; @@ -1220,20 +1340,31 @@ static void *miner_thread(void *userdata) max_nonce = end_nonce; else max_nonce = work.data[19] + max64; - + hashes_done = 0; gettimeofday(&tv_start, NULL); /* scan nonces for a proof-of-work hash */ - switch (opt_algo) { + switch (opt_algo) + { case ALGO_SCRYPT: rc = scanhash_scrypt(thr_id, work.data, scratchbuf, work.target, - max_nonce, &hashes_done, opt_scrypt_n); + max_nonce, &hashes_done, opt_scrypt_n); break; case ALGO_SHA256D: rc = scanhash_sha256d(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); + break; + + case ALGO_SHA256: + rc = scanhash_sha256(thr_id, work.data, work.target, + max_nonce, &hashes_done); + break; + + case ALGO_SHA256ET10: + rc = scanhash_sha256ET10(thr_id, work.data, work.target, + max_nonce, &hashes_done); break; default: @@ -1244,23 +1375,27 @@ static void *miner_thread(void *userdata) /* record scanhash elapsed time */ gettimeofday(&tv_end, NULL); timeval_subtract(&diff, &tv_end, &tv_start); - if (diff.tv_usec || diff.tv_sec) { + if (diff.tv_usec || diff.tv_sec) + { pthread_mutex_lock(&stats_lock); thr_hashrates[thr_id] = - hashes_done / (diff.tv_sec + 1e-6 * diff.tv_usec); + hashes_done / (diff.tv_sec + 1e-6 * diff.tv_usec); pthread_mutex_unlock(&stats_lock); } - if (!opt_quiet) { + if (!opt_quiet) + { sprintf(s, thr_hashrates[thr_id] >= 1e6 ? "%.0f" : "%.2f", - 1e-3 * thr_hashrates[thr_id]); + 1e-3 * thr_hashrates[thr_id]); applog(LOG_INFO, "thread %d: %lu hashes, %s khash/s", - thr_id, hashes_done, s); + thr_id, hashes_done, s); } - if (opt_benchmark && thr_id == opt_n_threads - 1) { + if (opt_benchmark && thr_id == opt_n_threads - 1) + { double hashrate = 0.; for (i = 0; i < opt_n_threads && thr_hashrates[i]; i++) hashrate += thr_hashrates[i]; - if (i == opt_n_threads) { + if (i == opt_n_threads) + { sprintf(s, hashrate >= 1e6 ? 
"%.0f" : "%.2f", 1e-3 * hashrate); applog(LOG_INFO, "Total: %s khash/s", s); } @@ -1293,7 +1428,8 @@ static void *longpoll_thread(void *userdata) bool need_slash = false; curl = curl_easy_init(); - if (unlikely(!curl)) { + if (unlikely(!curl)) + { applog(LOG_ERR, "CURL initialization failed"); goto out; } @@ -1304,13 +1440,15 @@ static void *longpoll_thread(void *userdata) goto out; /* full URL */ - if (strstr(hdr_path, "://")) { + if (strstr(hdr_path, "://")) + { lp_url = hdr_path; hdr_path = NULL; } - + /* absolute path, on current server */ - else { + else + { copy_start = (*hdr_path == '/') ? (hdr_path + 1) : hdr_path; if (rpc_url[strlen(rpc_url) - 1] != '/') need_slash = true; @@ -1324,25 +1462,29 @@ static void *longpoll_thread(void *userdata) applog(LOG_INFO, "Long-polling activated for %s", lp_url); - while (1) { + while (1) + { json_t *val, *res, *soval; char *req = NULL; int err; - if (have_gbt) { + if (have_gbt) + { req = malloc(strlen(gbt_lp_req) + strlen(lp_id) + 1); sprintf(req, gbt_lp_req, lp_id); } val = json_rpc_call(curl, lp_url, rpc_userpass, - req ? req : getwork_req, &err, - JSON_RPC_LONGPOLL); + req ? req : getwork_req, &err, + JSON_RPC_LONGPOLL); free(req); - if (have_stratum) { + if (have_stratum) + { if (val) json_decref(val); goto out; } - if (likely(val)) { + if (likely(val)) + { bool rc; applog(LOG_INFO, "LONGPOLL pushed new work"); res = json_object_get(val, "result"); @@ -1354,19 +1496,25 @@ static void *longpoll_thread(void *userdata) rc = gbt_work_decode(res, &g_work); else rc = work_decode(res, &g_work); - if (rc) { + if (rc) + { time(&g_work_time); restart_threads(); } pthread_mutex_unlock(&g_work_lock); json_decref(val); - } else { + } + else + { pthread_mutex_lock(&g_work_lock); g_work_time -= LP_SCANTIME; pthread_mutex_unlock(&g_work_lock); - if (err == CURLE_OPERATION_TIMEDOUT) { + if (err == CURLE_OPERATION_TIMEDOUT) + { restart_threads(); - } else { + } + else + { have_longpoll = false; restart_threads(); free(hdr_path); @@ -1395,7 +1543,8 @@ static bool stratum_handle_response(char *buf) bool ret = false; val = JSON_LOADS(buf, &err); - if (!val) { + if (!val) + { applog(LOG_INFO, "JSON decode failed(%d): %s", err.line, err.text); goto out; } @@ -1408,7 +1557,7 @@ static bool stratum_handle_response(char *buf) goto out; share_result(json_is_true(res_val), - err_val ? json_string_value(json_array_get(err_val, 1)) : NULL); + err_val ? 
json_string_value(json_array_get(err_val, 1)) : NULL); ret = true; out: @@ -1428,20 +1577,24 @@ static void *stratum_thread(void *userdata) goto out; applog(LOG_INFO, "Starting Stratum on %s", stratum.url); - while (1) { + while (1) + { int failures = 0; - while (!stratum.curl) { + while (!stratum.curl) + { pthread_mutex_lock(&g_work_lock); g_work_time = 0; pthread_mutex_unlock(&g_work_lock); restart_threads(); if (!stratum_connect(&stratum, stratum.url) || - !stratum_subscribe(&stratum) || - !stratum_authorize(&stratum, rpc_user, rpc_pass)) { + !stratum_subscribe(&stratum) || + !stratum_authorize(&stratum, rpc_user, rpc_pass)) + { stratum_disconnect(&stratum); - if (opt_retries >= 0 && ++failures > opt_retries) { + if (opt_retries >= 0 && ++failures > opt_retries) + { applog(LOG_ERR, "...terminating workio thread"); tq_push(thr_info[work_thr_id].q, NULL); goto out; @@ -1452,23 +1605,28 @@ static void *stratum_thread(void *userdata) } if (stratum.job.job_id && - (!g_work_time || strcmp(stratum.job.job_id, g_work.job_id))) { + (!g_work_time || strcmp(stratum.job.job_id, g_work.job_id))) + { pthread_mutex_lock(&g_work_lock); stratum_gen_work(&stratum, &g_work); time(&g_work_time); pthread_mutex_unlock(&g_work_lock); - if (stratum.job.clean) { + if (stratum.job.clean) + { applog(LOG_INFO, "Stratum requested work restart"); restart_threads(); } } - - if (!stratum_socket_full(&stratum, 120)) { + + if (!stratum_socket_full(&stratum, 120)) + { applog(LOG_ERR, "Stratum connection timed out"); s = NULL; - } else + } + else s = stratum_recv_line(&stratum); - if (!s) { + if (!s) + { stratum_disconnect(&stratum); applog(LOG_ERR, "Stratum connection interrupted"); continue; @@ -1486,47 +1644,47 @@ static void show_version_and_exit(void) { printf(PACKAGE_STRING "\n built on " __DATE__ "\n features:" #if defined(USE_ASM) && defined(__i386__) - " i386" + " i386" #endif #if defined(USE_ASM) && defined(__x86_64__) - " x86_64" - " PHE" + " x86_64" + " PHE" #endif #if defined(USE_ASM) && (defined(__i386__) || defined(__x86_64__)) - " SSE2" + " SSE2" #endif #if defined(__x86_64__) && defined(USE_AVX) - " AVX" + " AVX" #endif #if defined(__x86_64__) && defined(USE_AVX2) - " AVX2" + " AVX2" #endif #if defined(__x86_64__) && defined(USE_XOP) - " XOP" + " XOP" #endif #if defined(USE_ASM) && defined(__arm__) && defined(__APCS_32__) - " ARM" + " ARM" #if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \ - defined(__ARM_ARCH_5TEJ__) || defined(__ARM_ARCH_6__) || \ - defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || \ - defined(__ARM_ARCH_6M__) || defined(__ARM_ARCH_6T2__) || \ - defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || \ - defined(__ARM_ARCH_7__) || \ - defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || \ - defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7EM__) - " ARMv5E" + defined(__ARM_ARCH_5TEJ__) || defined(__ARM_ARCH_6__) || \ + defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || \ + defined(__ARM_ARCH_6M__) || defined(__ARM_ARCH_6T2__) || \ + defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || \ + defined(__ARM_ARCH_7__) || \ + defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || \ + defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7EM__) + " ARMv5E" #endif #if defined(__ARM_NEON__) - " NEON" + " NEON" #endif #endif #if defined(USE_ASM) && (defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)) - " PowerPC" + " PowerPC" #if defined(__ALTIVEC__) - " AltiVec" + " AltiVec" #endif #endif - "\n"); + "\n"); printf("%s\n", curl_version()); #ifdef JANSSON_VERSION @@ 
-1546,8 +1704,10 @@ static void show_usage_and_exit(int status) static void strhide(char *s) { - if (*s) *s++ = 'x'; - while (*s) *s++ = '\0'; + if (*s) + *s++ = 'x'; + while (*s) + *s++ = '\0'; } static void parse_config(json_t *config, char *pname, char *ref); @@ -1557,19 +1717,24 @@ static void parse_arg(int key, char *arg, char *pname) char *p; int v, i; - switch(key) { + switch (key) + { case 'a': - for (i = 0; i < ARRAY_SIZE(algo_names); i++) { + for (i = 0; i < ARRAY_SIZE(algo_names); i++) + { v = strlen(algo_names[i]); - if (!strncmp(arg, algo_names[i], v)) { - if (arg[v] == '\0') { + if (!strncmp(arg, algo_names[i], v)) + { + if (arg[v] == '\0') + { opt_algo = i; break; } - if (arg[v] == ':' && i == ALGO_SCRYPT) { + if (arg[v] == ':' && i == ALGO_SCRYPT) + { char *ep; - v = strtol(arg+v+1, &ep, 10); - if (*ep || v & (v-1) || v < 2) + v = strtol(arg + v + 1, &ep, 10); + if (*ep || v & (v - 1) || v < 2) continue; opt_algo = i; opt_scrypt_n = v; @@ -1577,24 +1742,27 @@ static void parse_arg(int key, char *arg, char *pname) } } } - if (i == ARRAY_SIZE(algo_names)) { + if (i == ARRAY_SIZE(algo_names)) + { fprintf(stderr, "%s: unknown algorithm -- '%s'\n", - pname, arg); + pname, arg); show_usage_and_exit(1); } break; case 'B': opt_background = true; break; - case 'c': { + case 'c': + { json_error_t err; json_t *config = JSON_LOAD_FILE(arg, &err); - if (!json_is_object(config)) { + if (!json_is_object(config)) + { if (err.line < 0) fprintf(stderr, "%s: %s\n", pname, err.text); else fprintf(stderr, "%s: %s:%d: %s\n", - pname, arg, err.line, err.text); + pname, arg, err.line, err.text); exit(1); } parse_config(config, pname, arg); @@ -1617,31 +1785,31 @@ static void parse_arg(int key, char *arg, char *pname) break; case 'r': v = atoi(arg); - if (v < -1 || v > 9999) /* sanity check */ + if (v < -1 || v > 9999) /* sanity check */ show_usage_and_exit(1); opt_retries = v; break; case 'R': v = atoi(arg); - if (v < 1 || v > 9999) /* sanity check */ + if (v < 1 || v > 9999) /* sanity check */ show_usage_and_exit(1); opt_fail_pause = v; break; case 's': v = atoi(arg); - if (v < 1 || v > 9999) /* sanity check */ + if (v < 1 || v > 9999) /* sanity check */ show_usage_and_exit(1); opt_scantime = v; break; case 'T': v = atoi(arg); - if (v < 1 || v > 99999) /* sanity check */ + if (v < 1 || v > 99999) /* sanity check */ show_usage_and_exit(1); opt_timeout = v; break; case 't': v = atoi(arg); - if (v < 1 || v > 9999) /* sanity check */ + if (v < 1 || v > 9999) /* sanity check */ show_usage_and_exit(1); opt_n_threads = v; break; @@ -1649,15 +1817,18 @@ static void parse_arg(int key, char *arg, char *pname) free(rpc_user); rpc_user = strdup(arg); break; - case 'o': { /* --url */ + case 'o': + { /* --url */ char *ap, *hp; ap = strstr(arg, "://"); ap = ap ? 
ap + 3 : arg; hp = strrchr(arg, '@'); - if (hp) { + if (hp) + { *hp = '\0'; p = strchr(ap, ':'); - if (p) { + if (p) + { free(rpc_userpass); rpc_userpass = strdup(ap); free(rpc_user); @@ -1665,34 +1836,43 @@ static void parse_arg(int key, char *arg, char *pname) strncpy(rpc_user, ap, p - ap); free(rpc_pass); rpc_pass = strdup(++p); - if (*p) *p++ = 'x'; + if (*p) + *p++ = 'x'; v = strlen(hp + 1) + 1; memmove(p + 1, hp + 1, v); memset(p + v, 0, hp - p); hp = p; - } else { + } + else + { free(rpc_user); rpc_user = strdup(ap); } *hp++ = '@'; - } else + } + else hp = ap; - if (ap != arg) { + if (ap != arg) + { if (strncasecmp(arg, "http://", 7) && - strncasecmp(arg, "https://", 8) && - strncasecmp(arg, "stratum+tcp://", 14) && - strncasecmp(arg, "stratum+tcps://", 15)) { + strncasecmp(arg, "https://", 8) && + strncasecmp(arg, "stratum+tcp://", 14) && + strncasecmp(arg, "stratum+tcps://", 15)) + { fprintf(stderr, "%s: unknown protocol -- '%s'\n", - pname, arg); + pname, arg); show_usage_and_exit(1); } free(rpc_url); rpc_url = strdup(arg); strcpy(rpc_url + (ap - arg), hp); - } else { - if (*hp == '\0' || *hp == '/') { + } + else + { + if (*hp == '\0' || *hp == '/') + { fprintf(stderr, "%s: invalid URL -- '%s'\n", - pname, arg); + pname, arg); show_usage_and_exit(1); } free(rpc_url); @@ -1702,11 +1882,12 @@ static void parse_arg(int key, char *arg, char *pname) have_stratum = !opt_benchmark && !strncasecmp(rpc_url, "stratum", 7); break; } - case 'O': /* --userpass */ + case 'O': /* --userpass */ p = strchr(arg, ':'); - if (!p) { + if (!p) + { fprintf(stderr, "%s: invalid username:password pair -- '%s'\n", - pname, arg); + pname, arg); show_usage_and_exit(1); } free(rpc_userpass); @@ -1718,7 +1899,7 @@ static void parse_arg(int key, char *arg, char *pname) rpc_pass = strdup(++p); strhide(p); break; - case 'x': /* --proxy */ + case 'x': /* --proxy */ if (!strncasecmp(arg, "socks4://", 9)) opt_proxy_type = CURLPROXY_SOCKS4; else if (!strncasecmp(arg, "socks5://", 9)) @@ -1759,16 +1940,18 @@ static void parse_arg(int key, char *arg, char *pname) case 1011: have_gbt = false; break; - case 1013: /* --coinbase-addr */ + case 1013: /* --coinbase-addr */ pk_script_size = address_to_script(pk_script, sizeof(pk_script), arg); - if (!pk_script_size) { + if (!pk_script_size) + { fprintf(stderr, "%s: invalid address -- '%s'\n", - pname, arg); + pname, arg); show_usage_and_exit(1); } break; - case 1015: /* --coinbase-sig */ - if (strlen(arg) + 1 > sizeof(coinbase_sig)) { + case 1015: /* --coinbase-sig */ + if (strlen(arg) + 1 > sizeof(coinbase_sig)) + { fprintf(stderr, "%s: coinbase signature too long\n", pname); show_usage_and_exit(1); } @@ -1792,7 +1975,8 @@ static void parse_config(json_t *config, char *pname, char *ref) char *s; json_t *val; - for (i = 0; i < ARRAY_SIZE(options); i++) { + for (i = 0; i < ARRAY_SIZE(options); i++) + { if (!options[i].name) break; @@ -1800,10 +1984,12 @@ static void parse_config(json_t *config, char *pname, char *ref) if (!val) continue; - if (options[i].has_arg && json_is_string(val)) { - if (!strcmp(options[i].name, "config")) { + if (options[i].has_arg && json_is_string(val)) + { + if (!strcmp(options[i].name, "config")) + { fprintf(stderr, "%s: %s: option '%s' not allowed here\n", - pname, ref, options[i].name); + pname, ref, options[i].name); exit(1); } s = strdup(json_string_value(val)); @@ -1811,11 +1997,15 @@ static void parse_config(json_t *config, char *pname, char *ref) break; parse_arg(options[i].val, s, pname); free(s); - } else if (!options[i].has_arg && 
json_is_true(val)) { + } + else if (!options[i].has_arg && json_is_true(val)) + { parse_arg(options[i].val, "", pname); - } else { + } + else + { fprintf(stderr, "%s: invalid argument for option '%s'\n", - pname, options[i].name); + pname, options[i].name); exit(1); } } @@ -1825,7 +2015,8 @@ static void parse_cmdline(int argc, char *argv[]) { int key; - while (1) { + while (1) + { #if HAVE_GETOPT_LONG key = getopt_long(argc, argv, short_options, options, NULL); #else @@ -1836,9 +2027,10 @@ static void parse_cmdline(int argc, char *argv[]) parse_arg(key, optarg, argv[0]); } - if (optind < argc) { + if (optind < argc) + { fprintf(stderr, "%s: unsupported non-option argument -- '%s'\n", - argv[0], argv[optind]); + argv[0], argv[optind]); show_usage_and_exit(1); } } @@ -1846,7 +2038,8 @@ static void parse_cmdline(int argc, char *argv[]) #ifndef WIN32 static void signal_handler(int sig) { - switch (sig) { + switch (sig) + { case SIGHUP: applog(LOG_INFO, "SIGHUP received"); break; @@ -1874,12 +2067,14 @@ int main(int argc, char *argv[]) /* parse command line */ parse_cmdline(argc, argv); - if (!opt_benchmark && !rpc_url) { + if (!opt_benchmark && !rpc_url) + { fprintf(stderr, "%s: no URL supplied\n", argv[0]); show_usage_and_exit(1); } - if (!rpc_userpass) { + if (!rpc_userpass) + { rpc_userpass = malloc(strlen(rpc_user) + strlen(rpc_pass) + 2); if (!rpc_userpass) return 1; @@ -1893,19 +2088,23 @@ int main(int argc, char *argv[]) pthread_mutex_init(&stratum.work_lock, NULL); flags = opt_benchmark || (strncasecmp(rpc_url, "https://", 8) && - strncasecmp(rpc_url, "stratum+tcps://", 15)) - ? (CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL) - : CURL_GLOBAL_ALL; - if (curl_global_init(flags)) { + strncasecmp(rpc_url, "stratum+tcps://", 15)) + ? (CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL) + : CURL_GLOBAL_ALL; + if (curl_global_init(flags)) + { applog(LOG_ERR, "CURL initialization failed"); return 1; } #ifndef WIN32 - if (opt_background) { + if (opt_background) + { i = fork(); - if (i < 0) exit(1); - if (i > 0) exit(0); + if (i < 0) + exit(1); + if (i > 0) + exit(0); i = setsid(); if (i < 0) applog(LOG_ERR, "setsid() failed (errno = %d)", errno); @@ -1925,7 +2124,7 @@ int main(int argc, char *argv[]) #elif defined(_SC_NPROCESSORS_CONF) num_processors = sysconf(_SC_NPROCESSORS_CONF); #elif defined(CTL_HW) && defined(HW_NCPU) - int req[] = { CTL_HW, HW_NCPU }; + int req[] = {CTL_HW, HW_NCPU}; size_t len = sizeof(num_processors); sysctl(req, 2, &num_processors, &len, NULL, 0); #else @@ -1948,8 +2147,8 @@ int main(int argc, char *argv[]) thr_info = calloc(opt_n_threads + 3, sizeof(*thr)); if (!thr_info) return 1; - - thr_hashrates = (double *) calloc(opt_n_threads, sizeof(double)); + + thr_hashrates = (double *)calloc(opt_n_threads, sizeof(double)); if (!thr_hashrates) return 1; @@ -1962,12 +2161,14 @@ int main(int argc, char *argv[]) return 1; /* start work I/O thread */ - if (pthread_create(&thr->pth, NULL, workio_thread, thr)) { + if (pthread_create(&thr->pth, NULL, workio_thread, thr)) + { applog(LOG_ERR, "workio thread create failed"); return 1; } - if (want_longpoll && !have_stratum) { + if (want_longpoll && !have_stratum) + { /* init longpoll thread info */ longpoll_thr_id = opt_n_threads + 1; thr = &thr_info[longpoll_thr_id]; @@ -1977,12 +2178,14 @@ int main(int argc, char *argv[]) return 1; /* start longpoll thread */ - if (unlikely(pthread_create(&thr->pth, NULL, longpoll_thread, thr))) { + if (unlikely(pthread_create(&thr->pth, NULL, longpoll_thread, thr))) + { applog(LOG_ERR, "longpoll thread create failed"); 
return 1; } } - if (want_stratum) { + if (want_stratum) + { /* init stratum thread info */ stratum_thr_id = opt_n_threads + 2; thr = &thr_info[stratum_thr_id]; @@ -1992,7 +2195,8 @@ int main(int argc, char *argv[]) return 1; /* start stratum thread */ - if (unlikely(pthread_create(&thr->pth, NULL, stratum_thread, thr))) { + if (unlikely(pthread_create(&thr->pth, NULL, stratum_thread, thr))) + { applog(LOG_ERR, "stratum thread create failed"); return 1; } @@ -2002,7 +2206,8 @@ int main(int argc, char *argv[]) } /* start mining threads */ - for (i = 0; i < opt_n_threads; i++) { + for (i = 0; i < opt_n_threads; i++) + { thr = &thr_info[i]; thr->id = i; @@ -2010,16 +2215,17 @@ int main(int argc, char *argv[]) if (!thr->q) return 1; - if (unlikely(pthread_create(&thr->pth, NULL, miner_thread, thr))) { + if (unlikely(pthread_create(&thr->pth, NULL, miner_thread, thr))) + { applog(LOG_ERR, "thread %d create failed", i); return 1; } } applog(LOG_INFO, "%d miner threads started, " - "using '%s' algorithm.", - opt_n_threads, - algo_names[opt_algo]); + "using '%s' algorithm.", + opt_n_threads, + algo_names[opt_algo]); /* main loop - simply wait for workio thread to exit */ pthread_join(thr_info[work_thr_id].pth, NULL); diff --git a/miner.h b/miner.h index ba9163ae2..7bbc889de 100644 --- a/miner.h +++ b/miner.h @@ -2,7 +2,6 @@ #define __MINER_H__ #include "cpuminer-config.h" - #include #include #include @@ -11,35 +10,37 @@ #include #ifdef STDC_HEADERS -# include -# include +#include +#include #else -# ifdef HAVE_STDLIB_H -# include -# endif +#ifdef HAVE_STDLIB_H +#include +#endif #endif #ifdef HAVE_ALLOCA_H -# include +#include #elif !defined alloca -# ifdef __GNUC__ -# define alloca __builtin_alloca -# elif defined _AIX -# define alloca __alloca -# elif defined _MSC_VER -# include -# define alloca _alloca -# elif !defined HAVE_ALLOCA -# ifdef __cplusplus +#ifdef __GNUC__ +#define alloca __builtin_alloca +#elif defined _AIX +#define alloca __alloca +#elif defined _MSC_VER +#include +#define alloca _alloca +#elif !defined HAVE_ALLOCA +#ifdef __cplusplus extern "C" -# endif -void *alloca (size_t); -# endif +#endif + void * + alloca(size_t); +#endif #endif #ifdef HAVE_SYSLOG_H #include #else -enum { +enum +{ LOG_ERR, LOG_WARNING, LOG_NOTICE, @@ -65,8 +66,7 @@ enum { #if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) #define WANT_BUILTIN_BSWAP #else -#define bswap_32(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \ - | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) +#define bswap_32(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) #endif static inline uint32_t swab32(uint32_t v) @@ -87,7 +87,7 @@ static inline uint32_t be32dec(const void *pp) { const uint8_t *p = (uint8_t const *)pp; return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) + - ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24)); + ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24)); } #endif @@ -96,7 +96,7 @@ static inline uint32_t le32dec(const void *pp) { const uint8_t *p = (uint8_t const *)pp; return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) + - ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24)); + ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24)); } #endif @@ -152,22 +152,30 @@ void sha256_transform_8way(uint32_t *state, const uint32_t *block, int swap); #endif extern int scanhash_sha256d(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); + const uint32_t *ptarget, 
uint32_t max_nonce, unsigned long *hashes_done); + +extern int scanhash_sha256(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); + +extern int scanhash_sha256ET10(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); extern unsigned char *scrypt_buffer_alloc(int N); extern int scanhash_scrypt(int thr_id, uint32_t *pdata, - unsigned char *scratchbuf, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done, int N); + unsigned char *scratchbuf, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done, int N); -struct thr_info { - int id; - pthread_t pth; - struct thread_q *q; +struct thr_info +{ + int id; + pthread_t pth; + struct thread_q *q; }; -struct work_restart { - volatile unsigned long restart; - char padding[128 - sizeof(unsigned long)]; +struct work_restart +{ + volatile unsigned long restart; + char padding[128 - sizeof(unsigned long)]; }; extern bool opt_debug; @@ -190,12 +198,12 @@ extern int longpoll_thr_id; extern int stratum_thr_id; extern struct work_restart *work_restart; -#define JSON_RPC_LONGPOLL (1 << 0) -#define JSON_RPC_QUIET_404 (1 << 1) +#define JSON_RPC_LONGPOLL (1 << 0) +#define JSON_RPC_QUIET_404 (1 << 1) extern void applog(int prio, const char *fmt, ...); extern json_t *json_rpc_call(CURL *curl, const char *url, const char *userpass, - const char *rpc_req, int *curl_err, int flags); + const char *rpc_req, int *curl_err, int flags); void memrev(unsigned char *p, size_t len); extern void bin2hex(char *s, const unsigned char *p, size_t len); extern char *abin2hex(const unsigned char *p, size_t len); @@ -203,11 +211,12 @@ extern bool hex2bin(unsigned char *p, const char *hexstr, size_t len); extern int varint_encode(unsigned char *p, uint64_t n); extern size_t address_to_script(unsigned char *out, size_t outsz, const char *addr); extern int timeval_subtract(struct timeval *result, struct timeval *x, - struct timeval *y); + struct timeval *y); extern bool fulltest(const uint32_t *hash, const uint32_t *target); extern void diff_to_target(uint32_t *target, double diff); -struct stratum_job { +struct stratum_job +{ char *job_id; unsigned char prevhash[32]; size_t coinbase_size; @@ -222,7 +231,8 @@ struct stratum_job { double diff; }; -struct stratum_ctx { +struct stratum_ctx +{ char *url; CURL *curl; diff --git a/scrypt-arm.S b/scrypt-arm.S index 5be3b0e9d..459900fd6 100644 --- a/scrypt-arm.S +++ b/scrypt-arm.S @@ -27,160 +27,109 @@ #ifdef __ARM_ARCH_5E_OR_6__ -.macro scrypt_shuffle - add lr, r0, #9*4 - ldmia r0, {r2-r7} - ldmia lr, {r2, r8-r12, lr} - str r3, [r0, #5*4] - str r5, [r0, #15*4] - str r6, [r0, #12*4] - str r7, [r0, #1*4] - ldr r5, [r0, #7*4] - str r2, [r0, #13*4] - str r8, [r0, #2*4] - strd r4, [r0, #10*4] - str r9, [r0, #7*4] - str r10, [r0, #4*4] - str r11, [r0, #9*4] - str lr, [r0, #3*4] - - add r2, r0, #64+0*4 - add lr, r0, #64+9*4 - ldmia r2, {r2-r7} - ldmia lr, {r2, r8-r12, lr} - str r3, [r0, #64+5*4] - str r5, [r0, #64+15*4] - str r6, [r0, #64+12*4] - str r7, [r0, #64+1*4] - ldr r5, [r0, #64+7*4] - str r2, [r0, #64+13*4] - str r8, [r0, #64+2*4] - strd r4, [r0, #64+10*4] - str r9, [r0, #64+7*4] - str r10, [r0, #64+4*4] - str r11, [r0, #64+9*4] - str lr, [r0, #64+3*4] -.endm -.macro salsa8_core_doubleround_body - add r6, r2, r6 - add r7, r3, r7 - eor r10, r10, r6, ror #25 - add r6, r0, r4 - eor r11, r11, r7, ror #25 - add r7, r1, r5 - strd r10, [sp, #14*4] - eor r12, r12, r6, ror #25 - eor lr, lr, r7, ror #25 - - 
ldrd r6, [sp, #10*4] - add r2, r10, r2 - add r3, r11, r3 - eor r6, r6, r2, ror #23 - add r2, r12, r0 - eor r7, r7, r3, ror #23 - add r3, lr, r1 - strd r6, [sp, #10*4] - eor r8, r8, r2, ror #23 - eor r9, r9, r3, ror #23 - - ldrd r2, [sp, #6*4] - add r10, r6, r10 - add r11, r7, r11 - eor r2, r2, r10, ror #19 - add r10, r8, r12 - eor r3, r3, r11, ror #19 - add r11, r9, lr - eor r4, r4, r10, ror #19 - eor r5, r5, r11, ror #19 - - ldrd r10, [sp, #2*4] - add r6, r2, r6 - add r7, r3, r7 - eor r10, r10, r6, ror #14 - add r6, r4, r8 - eor r11, r11, r7, ror #14 - add r7, r5, r9 - eor r0, r0, r6, ror #14 - eor r1, r1, r7, ror #14 - + + +#else + + + + +#endif + + + + + + + + + .text + .code 32 + .align 2 + .globl scrypt_core + .globl _scrypt_core +#ifdef __ELF__ + .type scrypt_core, %function +#endif +scrypt_core: +_scrypt_core: + stmfd sp!, {r4-r11, lr} + mov r12, sp + sub sp, sp, #22*4 + bic sp, sp, #63 + str r12, [sp, #20*4] + str r2, [sp, #21*4] - ldrd r6, [sp, #14*4] - strd r2, [sp, #6*4] - strd r10, [sp, #2*4] - add r6, r11, r6 - add r7, r0, r7 - eor r4, r4, r6, ror #25 - add r6, r1, r12 - eor r5, r5, r7, ror #25 - add r7, r10, lr - eor r2, r2, r6, ror #25 - eor r3, r3, r7, ror #25 - strd r2, [sp, #6*4] - add r10, r3, r10 - ldrd r6, [sp, #10*4] - add r11, r4, r11 - eor r8, r8, r10, ror #23 - add r10, r5, r0 - eor r9, r9, r11, ror #23 - add r11, r2, r1 - eor r6, r6, r10, ror #23 - eor r7, r7, r11, ror #23 - strd r6, [sp, #10*4] + ldr r2, [sp, #21*4] + str r0, [sp, #16*4] + add r12, r1, r2, lsl #7 + str r12, [sp, #18*4] +scrypt_core_loop1: + add lr, r0, #16*4 + add r3, r1, #16*4 + mov r12, sp + ldmia r0, {r4-r7} + ldmia lr!, {r8-r11} + stmia r1!, {r4-r7} + stmia r3!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r0!, {r4-r7} + stmia r12!, {r4-r7} + ldmia r0, {r4-r7} + ldmia lr!, {r8-r11} + stmia r1!, {r4-r7} + stmia r3!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r0!, {r4-r7} + stmia r12!, {r4-r7} + ldmia r0, {r4-r7} + ldmia lr!, {r8-r11} + stmia r1!, {r4-r7} + stmia r3!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r0!, {r4-r7} + stmia r12!, {r4-r7} + ldmia r0, {r4-r7} + ldmia lr!, {r8-r11} + stmia r1!, {r4-r7} + stmia r3!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r0!, {r4-r7} + stmia r12!, {r4-r7} + str r1, [sp, #17*4] - add r2, r7, r2 - ldrd r10, [sp, #14*4] - add r3, r8, r3 - eor r12, r12, r2, ror #19 - add r2, r9, r4 - eor lr, lr, r3, ror #19 - add r3, r6, r5 - eor r10, r10, r2, ror #19 - eor r11, r11, r3, ror #19 + ldmia sp, {r0-r7} - ldrd r2, [sp, #2*4] - add r6, r11, r6 - add r7, r12, r7 - eor r0, r0, r6, ror #14 - add r6, lr, r8 - eor r1, r1, r7, ror #14 - add r7, r10, r9 - eor r2, r2, r6, ror #14 - eor r3, r3, r7, ror #14 -.endm - -.macro salsa8_core - ldmia sp, {r0-r12, lr} + ldr r12, [sp, #15*4] + ldr r8, [sp, #11*4] + ldr lr, [sp, #12*4] - ldrd r10, [sp, #14*4] - salsa8_core_doubleround_body - ldrd r6, [sp, #6*4] - strd r2, [sp, #2*4] - strd r10, [sp, #14*4] - salsa8_core_doubleround_body - ldrd r6, [sp, #6*4] - strd r2, [sp, #2*4] - strd r10, [sp, #14*4] - salsa8_core_doubleround_body - ldrd r6, [sp, #6*4] - strd r2, [sp, #2*4] - strd r10, [sp, #14*4] - salsa8_core_doubleround_body + ldr r9, [sp, #9*4] + add r8, r8, r12 + ldr r11, [sp, #10*4] + add lr, lr, r0 + eor r3, r3, r8, ror #25 + add r8, r5, r1 + ldr r10, [sp, #14*4] + eor r4, r4, lr, ror #25 + add lr, r11, r6 + eor r9, r9, r8, ror #25 + eor r10, r10, lr, 
ror #25 - stmia sp, {r0-r5} - strd r8, [sp, #8*4] - str r12, [sp, #12*4] - str lr, [sp, #13*4] - strd r10, [sp, #14*4] -.endm - -#else - -.macro scrypt_shuffle -.endm - -.macro salsa8_core_doubleround_body ldr r8, [sp, #8*4] add r11, r11, r10 ldr lr, [sp, #13*4] @@ -257,28 +206,6 @@ eor r3, r3, r11, ror #19 str lr, [sp, #13*4] eor r4, r4, r12, ror #19 -.endm - -.macro salsa8_core - ldmia sp, {r0-r7} - - ldr r12, [sp, #15*4] - ldr r8, [sp, #11*4] - ldr lr, [sp, #12*4] - - ldr r9, [sp, #9*4] - add r8, r8, r12 - ldr r11, [sp, #10*4] - add lr, lr, r0 - eor r3, r3, r8, ror #25 - add r8, r5, r1 - ldr r10, [sp, #14*4] - eor r4, r4, lr, ror #25 - add lr, r11, r6 - eor r9, r9, r8, ror #25 - eor r10, r10, lr, ror #25 - - salsa8_core_doubleround_body ldr r11, [sp, #10*4] add r8, r9, r8 @@ -306,40 +233,190 @@ str r10, [sp, #14*4] eor r10, r10, lr, ror #25 - salsa8_core_doubleround_body - - ldr r11, [sp, #10*4] - add r8, r9, r8 - ldr r12, [sp, #15*4] - add lr, r10, lr - eor r11, r11, r8, ror #14 - add r8, r3, r2 - eor r12, r12, lr, ror #14 - add lr, r4, r7 - eor r0, r0, r8, ror #14 - ldr r8, [sp, #11*4] - eor r5, r5, lr, ror #14 - ldr lr, [sp, #12*4] - - add r8, r8, r12 - str r11, [sp, #10*4] - add lr, lr, r0 - str r12, [sp, #15*4] - eor r3, r3, r8, ror #25 - add r8, r5, r1 - eor r4, r4, lr, ror #25 - add lr, r11, r6 + ldr r8, [sp, #8*4] + add r11, r11, r10 + ldr lr, [sp, #13*4] + add r12, r12, r3 + eor r2, r2, r11, ror #23 + add r11, r4, r0 + eor r7, r7, r12, ror #23 + add r12, r9, r5 str r9, [sp, #9*4] - eor r9, r9, r8, ror #25 + eor r8, r8, r11, ror #23 str r10, [sp, #14*4] - eor r10, r10, lr, ror #25 - - salsa8_core_doubleround_body + eor lr, lr, r12, ror #23 - ldr r11, [sp, #10*4] - add r8, r9, r8 - ldr r12, [sp, #15*4] - add lr, r10, lr + ldr r11, [sp, #11*4] + add r9, lr, r9 + ldr r12, [sp, #12*4] + add r10, r2, r10 + eor r1, r1, r9, ror #19 + add r9, r7, r3 + eor r6, r6, r10, ror #19 + add r10, r8, r4 + str r8, [sp, #8*4] + eor r11, r11, r9, ror #19 + str lr, [sp, #13*4] + eor r12, r12, r10, ror #19 + + ldr r9, [sp, #10*4] + add r8, r12, r8 + ldr r10, [sp, #15*4] + add lr, r1, lr + eor r0, r0, r8, ror #14 + add r8, r6, r2 + eor r5, r5, lr, ror #14 + add lr, r11, r7 + eor r9, r9, r8, ror #14 + ldr r8, [sp, #9*4] + eor r10, r10, lr, ror #14 + ldr lr, [sp, #14*4] + + + add r8, r9, r8 + str r9, [sp, #10*4] + add lr, r10, lr + str r10, [sp, #15*4] + eor r11, r11, r8, ror #25 + add r8, r0, r3 + eor r12, r12, lr, ror #25 + add lr, r5, r4 + eor r1, r1, r8, ror #25 + ldr r8, [sp, #8*4] + eor r6, r6, lr, ror #25 + + add r9, r11, r9 + ldr lr, [sp, #13*4] + add r10, r12, r10 + eor r8, r8, r9, ror #23 + add r9, r1, r0 + eor lr, lr, r10, ror #23 + add r10, r6, r5 + str r11, [sp, #11*4] + eor r2, r2, r9, ror #23 + str r12, [sp, #12*4] + eor r7, r7, r10, ror #23 + + ldr r9, [sp, #9*4] + add r11, r8, r11 + ldr r10, [sp, #14*4] + add r12, lr, r12 + eor r9, r9, r11, ror #19 + add r11, r2, r1 + eor r10, r10, r12, ror #19 + add r12, r7, r6 + str r8, [sp, #8*4] + eor r3, r3, r11, ror #19 + str lr, [sp, #13*4] + eor r4, r4, r12, ror #19 + + ldr r11, [sp, #10*4] + add r8, r9, r8 + ldr r12, [sp, #15*4] + add lr, r10, lr + eor r11, r11, r8, ror #14 + add r8, r3, r2 + eor r12, r12, lr, ror #14 + add lr, r4, r7 + eor r0, r0, r8, ror #14 + ldr r8, [sp, #11*4] + eor r5, r5, lr, ror #14 + ldr lr, [sp, #12*4] + + add r8, r8, r12 + str r11, [sp, #10*4] + add lr, lr, r0 + str r12, [sp, #15*4] + eor r3, r3, r8, ror #25 + add r8, r5, r1 + eor r4, r4, lr, ror #25 + add lr, r11, r6 + str r9, [sp, #9*4] + eor r9, r9, r8, ror #25 + 
str r10, [sp, #14*4] + eor r10, r10, lr, ror #25 + + ldr r8, [sp, #8*4] + add r11, r11, r10 + ldr lr, [sp, #13*4] + add r12, r12, r3 + eor r2, r2, r11, ror #23 + add r11, r4, r0 + eor r7, r7, r12, ror #23 + add r12, r9, r5 + str r9, [sp, #9*4] + eor r8, r8, r11, ror #23 + str r10, [sp, #14*4] + eor lr, lr, r12, ror #23 + + ldr r11, [sp, #11*4] + add r9, lr, r9 + ldr r12, [sp, #12*4] + add r10, r2, r10 + eor r1, r1, r9, ror #19 + add r9, r7, r3 + eor r6, r6, r10, ror #19 + add r10, r8, r4 + str r8, [sp, #8*4] + eor r11, r11, r9, ror #19 + str lr, [sp, #13*4] + eor r12, r12, r10, ror #19 + + ldr r9, [sp, #10*4] + add r8, r12, r8 + ldr r10, [sp, #15*4] + add lr, r1, lr + eor r0, r0, r8, ror #14 + add r8, r6, r2 + eor r5, r5, lr, ror #14 + add lr, r11, r7 + eor r9, r9, r8, ror #14 + ldr r8, [sp, #9*4] + eor r10, r10, lr, ror #14 + ldr lr, [sp, #14*4] + + + add r8, r9, r8 + str r9, [sp, #10*4] + add lr, r10, lr + str r10, [sp, #15*4] + eor r11, r11, r8, ror #25 + add r8, r0, r3 + eor r12, r12, lr, ror #25 + add lr, r5, r4 + eor r1, r1, r8, ror #25 + ldr r8, [sp, #8*4] + eor r6, r6, lr, ror #25 + + add r9, r11, r9 + ldr lr, [sp, #13*4] + add r10, r12, r10 + eor r8, r8, r9, ror #23 + add r9, r1, r0 + eor lr, lr, r10, ror #23 + add r10, r6, r5 + str r11, [sp, #11*4] + eor r2, r2, r9, ror #23 + str r12, [sp, #12*4] + eor r7, r7, r10, ror #23 + + ldr r9, [sp, #9*4] + add r11, r8, r11 + ldr r10, [sp, #14*4] + add r12, lr, r12 + eor r9, r9, r11, ror #19 + add r11, r2, r1 + eor r10, r10, r12, ror #19 + add r12, r7, r6 + str r8, [sp, #8*4] + eor r3, r3, r11, ror #19 + str lr, [sp, #13*4] + eor r4, r4, r12, ror #19 + + ldr r11, [sp, #10*4] + add r8, r9, r8 + ldr r12, [sp, #15*4] + add lr, r10, lr eor r11, r11, r8, ror #14 add r8, r3, r2 eor r12, r12, lr, ror #14 @@ -362,7 +439,82 @@ str r10, [sp, #14*4] eor r10, r10, lr, ror #25 - salsa8_core_doubleround_body + ldr r8, [sp, #8*4] + add r11, r11, r10 + ldr lr, [sp, #13*4] + add r12, r12, r3 + eor r2, r2, r11, ror #23 + add r11, r4, r0 + eor r7, r7, r12, ror #23 + add r12, r9, r5 + str r9, [sp, #9*4] + eor r8, r8, r11, ror #23 + str r10, [sp, #14*4] + eor lr, lr, r12, ror #23 + + ldr r11, [sp, #11*4] + add r9, lr, r9 + ldr r12, [sp, #12*4] + add r10, r2, r10 + eor r1, r1, r9, ror #19 + add r9, r7, r3 + eor r6, r6, r10, ror #19 + add r10, r8, r4 + str r8, [sp, #8*4] + eor r11, r11, r9, ror #19 + str lr, [sp, #13*4] + eor r12, r12, r10, ror #19 + + ldr r9, [sp, #10*4] + add r8, r12, r8 + ldr r10, [sp, #15*4] + add lr, r1, lr + eor r0, r0, r8, ror #14 + add r8, r6, r2 + eor r5, r5, lr, ror #14 + add lr, r11, r7 + eor r9, r9, r8, ror #14 + ldr r8, [sp, #9*4] + eor r10, r10, lr, ror #14 + ldr lr, [sp, #14*4] + + + add r8, r9, r8 + str r9, [sp, #10*4] + add lr, r10, lr + str r10, [sp, #15*4] + eor r11, r11, r8, ror #25 + add r8, r0, r3 + eor r12, r12, lr, ror #25 + add lr, r5, r4 + eor r1, r1, r8, ror #25 + ldr r8, [sp, #8*4] + eor r6, r6, lr, ror #25 + + add r9, r11, r9 + ldr lr, [sp, #13*4] + add r10, r12, r10 + eor r8, r8, r9, ror #23 + add r9, r1, r0 + eor lr, lr, r10, ror #23 + add r10, r6, r5 + str r11, [sp, #11*4] + eor r2, r2, r9, ror #23 + str r12, [sp, #12*4] + eor r7, r7, r10, ror #23 + + ldr r9, [sp, #9*4] + add r11, r8, r11 + ldr r10, [sp, #14*4] + add r12, lr, r12 + eor r9, r9, r11, ror #19 + add r11, r2, r1 + eor r10, r10, r12, ror #19 + add r12, r7, r6 + str r8, [sp, #8*4] + eor r3, r3, r11, ror #19 + str lr, [sp, #13*4] + eor r4, r4, r12, ror #19 ldr r11, [sp, #10*4] add r8, r9, r8 @@ -380,47 +532,38 @@ eor r5, r5, lr, ror #14 stmia sp, {r0-r7} 
-.endm - -#endif - - -.macro scrypt_core_macro1a_x4 - ldmia r0, {r4-r7} - ldmia lr!, {r8-r11} - stmia r1!, {r4-r7} - stmia r3!, {r8-r11} - eor r4, r4, r8 - eor r5, r5, r9 - eor r6, r6, r10 - eor r7, r7, r11 + + ldr r0, [sp, #16*4] + mov r12, sp + add r2, r0, #16*4 + ldmia r12, {r4-r7} + ldmia r0, {r8-r11} + add r4, r4, r8 + add r5, r5, r9 + add r6, r6, r10 + add r7, r7, r11 stmia r0!, {r4-r7} - stmia r12!, {r4-r7} -.endm - -.macro scrypt_core_macro1b_x4 - ldmia r3!, {r8-r11} - ldmia r2, {r4-r7} - eor r8, r8, r4 - eor r9, r9, r5 - eor r10, r10, r6 - eor r11, r11, r7 - ldmia r0, {r4-r7} - stmia r2!, {r8-r11} + ldmia r2, {r8-r11} eor r4, r4, r8 eor r5, r5, r9 eor r6, r6, r10 eor r7, r7, r11 - ldmia r1!, {r8-r11} + stmia r2!, {r4-r7} + stmia r12!, {r4-r7} + ldmia r12, {r4-r7} + ldmia r0, {r8-r11} + add r4, r4, r8 + add r5, r5, r9 + add r6, r6, r10 + add r7, r7, r11 + stmia r0!, {r4-r7} + ldmia r2, {r8-r11} eor r4, r4, r8 eor r5, r5, r9 eor r6, r6, r10 eor r7, r7, r11 - stmia r0!, {r4-r7} + stmia r2!, {r4-r7} stmia r12!, {r4-r7} -.endm - -.macro scrypt_core_macro2_x4 ldmia r12, {r4-r7} ldmia r0, {r8-r11} add r4, r4, r8 @@ -435,147 +578,1518 @@ eor r7, r7, r11 stmia r2!, {r4-r7} stmia r12!, {r4-r7} -.endm - -.macro scrypt_core_macro3_x4 - ldmia r1!, {r4-r7} + ldmia r12, {r4-r7} ldmia r0, {r8-r11} add r4, r4, r8 add r5, r5, r9 add r6, r6, r10 add r7, r7, r11 stmia r0!, {r4-r7} -.endm - -.macro scrypt_core_macro3_x6 - ldmia r1!, {r2-r7} - ldmia r0, {r8-r12, lr} - add r2, r2, r8 - add r3, r3, r9 - add r4, r4, r10 - add r5, r5, r11 - add r6, r6, r12 - add r7, r7, lr - stmia r0!, {r2-r7} -.endm - - - .text - .code 32 - .align 2 - .globl scrypt_core - .globl _scrypt_core -#ifdef __ELF__ - .type scrypt_core, %function -#endif -scrypt_core: -_scrypt_core: - stmfd sp!, {r4-r11, lr} - mov r12, sp - sub sp, sp, #22*4 - bic sp, sp, #63 - str r12, [sp, #20*4] - str r2, [sp, #21*4] - - scrypt_shuffle + ldmia r2, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r2!, {r4-r7} + stmia r12!, {r4-r7} - ldr r2, [sp, #21*4] - str r0, [sp, #16*4] - add r12, r1, r2, lsl #7 - str r12, [sp, #18*4] -scrypt_core_loop1: - add lr, r0, #16*4 - add r3, r1, #16*4 - mov r12, sp - scrypt_core_macro1a_x4 - scrypt_core_macro1a_x4 - scrypt_core_macro1a_x4 - scrypt_core_macro1a_x4 - str r1, [sp, #17*4] + ldmia sp, {r0-r7} - salsa8_core + ldr r12, [sp, #15*4] + ldr r8, [sp, #11*4] + ldr lr, [sp, #12*4] - ldr r0, [sp, #16*4] - mov r12, sp - add r2, r0, #16*4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 + ldr r9, [sp, #9*4] + add r8, r8, r12 + ldr r11, [sp, #10*4] + add lr, lr, r0 + eor r3, r3, r8, ror #25 + add r8, r5, r1 + ldr r10, [sp, #14*4] + eor r4, r4, lr, ror #25 + add lr, r11, r6 + eor r9, r9, r8, ror #25 + eor r10, r10, lr, ror #25 - salsa8_core + ldr r8, [sp, #8*4] + add r11, r11, r10 + ldr lr, [sp, #13*4] + add r12, r12, r3 + eor r2, r2, r11, ror #23 + add r11, r4, r0 + eor r7, r7, r12, ror #23 + add r12, r9, r5 + str r9, [sp, #9*4] + eor r8, r8, r11, ror #23 + str r10, [sp, #14*4] + eor lr, lr, r12, ror #23 - ldr r0, [sp, #16*4] - mov r1, sp - add r0, r0, #16*4 - scrypt_core_macro3_x6 - scrypt_core_macro3_x6 - ldr r3, [sp, #17*4] - ldr r12, [sp, #18*4] - scrypt_core_macro3_x4 + ldr r11, [sp, #11*4] + add r9, lr, r9 + ldr r12, [sp, #12*4] + add r10, r2, r10 + eor r1, r1, r9, ror #19 + add r9, r7, r3 + eor r6, r6, r10, ror #19 + add r10, r8, r4 + str r8, [sp, #8*4] + eor r11, r11, r9, ror #19 + str lr, [sp, #13*4] + eor r12, r12, r10, ror 
#19 - add r1, r3, #16*4 - sub r0, r0, #32*4 - cmp r1, r12 - bne scrypt_core_loop1 + ldr r9, [sp, #10*4] + add r8, r12, r8 + ldr r10, [sp, #15*4] + add lr, r1, lr + eor r0, r0, r8, ror #14 + add r8, r6, r2 + eor r5, r5, lr, ror #14 + add lr, r11, r7 + eor r9, r9, r8, ror #14 + ldr r8, [sp, #9*4] + eor r10, r10, lr, ror #14 + ldr lr, [sp, #14*4] - ldr r12, [sp, #21*4] - ldr r4, [r0, #16*4] - sub r2, r12, #1 - str r2, [sp, #21*4] - sub r1, r1, r12, lsl #7 - str r1, [sp, #17*4] - and r4, r4, r2 - add r1, r1, r4, lsl #7 -scrypt_core_loop2: - add r2, r0, #16*4 - add r3, r1, #16*4 - str r12, [sp, #18*4] - mov r12, sp -#ifdef __ARM_ARCH_5E_OR_6_OR_7__ - pld [r1, #24*4] - pld [r1, #8*4] -#endif - scrypt_core_macro1b_x4 - scrypt_core_macro1b_x4 - scrypt_core_macro1b_x4 - scrypt_core_macro1b_x4 - salsa8_core + add r8, r9, r8 + str r9, [sp, #10*4] + add lr, r10, lr + str r10, [sp, #15*4] + eor r11, r11, r8, ror #25 + add r8, r0, r3 + eor r12, r12, lr, ror #25 + add lr, r5, r4 + eor r1, r1, r8, ror #25 + ldr r8, [sp, #8*4] + eor r6, r6, lr, ror #25 - ldr r0, [sp, #16*4] - mov r12, sp - add r2, r0, #16*4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 + add r9, r11, r9 + ldr lr, [sp, #13*4] + add r10, r12, r10 + eor r8, r8, r9, ror #23 + add r9, r1, r0 + eor lr, lr, r10, ror #23 + add r10, r6, r5 + str r11, [sp, #11*4] + eor r2, r2, r9, ror #23 + str r12, [sp, #12*4] + eor r7, r7, r10, ror #23 - salsa8_core + ldr r9, [sp, #9*4] + add r11, r8, r11 + ldr r10, [sp, #14*4] + add r12, lr, r12 + eor r9, r9, r11, ror #19 + add r11, r2, r1 + eor r10, r10, r12, ror #19 + add r12, r7, r6 + str r8, [sp, #8*4] + eor r3, r3, r11, ror #19 + str lr, [sp, #13*4] + eor r4, r4, r12, ror #19 - ldr r0, [sp, #16*4] - mov r1, sp - ldr r3, [sp, #17*4] - add r0, r0, #16*4 - ldr r2, [sp, #21*4] - scrypt_core_macro3_x4 - and r4, r4, r2 - add r3, r3, r4, lsl #7 - str r3, [sp, #19*4] -#ifdef __ARM_ARCH_5E_OR_6_OR_7__ - pld [r3, #16*4] - pld [r3] -#endif - scrypt_core_macro3_x6 - scrypt_core_macro3_x6 + ldr r11, [sp, #10*4] + add r8, r9, r8 + ldr r12, [sp, #15*4] + add lr, r10, lr + eor r11, r11, r8, ror #14 + add r8, r3, r2 + eor r12, r12, lr, ror #14 + add lr, r4, r7 + eor r0, r0, r8, ror #14 + ldr r8, [sp, #11*4] + eor r5, r5, lr, ror #14 + ldr lr, [sp, #12*4] - ldr r12, [sp, #18*4] - sub r0, r0, #32*4 - ldr r1, [sp, #19*4] - subs r12, r12, #1 - bne scrypt_core_loop2 + add r8, r8, r12 + str r11, [sp, #10*4] + add lr, lr, r0 + str r12, [sp, #15*4] + eor r3, r3, r8, ror #25 + add r8, r5, r1 + eor r4, r4, lr, ror #25 + add lr, r11, r6 + str r9, [sp, #9*4] + eor r9, r9, r8, ror #25 + str r10, [sp, #14*4] + eor r10, r10, lr, ror #25 + + ldr r8, [sp, #8*4] + add r11, r11, r10 + ldr lr, [sp, #13*4] + add r12, r12, r3 + eor r2, r2, r11, ror #23 + add r11, r4, r0 + eor r7, r7, r12, ror #23 + add r12, r9, r5 + str r9, [sp, #9*4] + eor r8, r8, r11, ror #23 + str r10, [sp, #14*4] + eor lr, lr, r12, ror #23 + + ldr r11, [sp, #11*4] + add r9, lr, r9 + ldr r12, [sp, #12*4] + add r10, r2, r10 + eor r1, r1, r9, ror #19 + add r9, r7, r3 + eor r6, r6, r10, ror #19 + add r10, r8, r4 + str r8, [sp, #8*4] + eor r11, r11, r9, ror #19 + str lr, [sp, #13*4] + eor r12, r12, r10, ror #19 + + ldr r9, [sp, #10*4] + add r8, r12, r8 + ldr r10, [sp, #15*4] + add lr, r1, lr + eor r0, r0, r8, ror #14 + add r8, r6, r2 + eor r5, r5, lr, ror #14 + add lr, r11, r7 + eor r9, r9, r8, ror #14 + ldr r8, [sp, #9*4] + eor r10, r10, lr, ror #14 + ldr lr, [sp, #14*4] + + + add r8, r9, r8 + str r9, [sp, #10*4] + add lr, r10, lr 
+ str r10, [sp, #15*4] + eor r11, r11, r8, ror #25 + add r8, r0, r3 + eor r12, r12, lr, ror #25 + add lr, r5, r4 + eor r1, r1, r8, ror #25 + ldr r8, [sp, #8*4] + eor r6, r6, lr, ror #25 + + add r9, r11, r9 + ldr lr, [sp, #13*4] + add r10, r12, r10 + eor r8, r8, r9, ror #23 + add r9, r1, r0 + eor lr, lr, r10, ror #23 + add r10, r6, r5 + str r11, [sp, #11*4] + eor r2, r2, r9, ror #23 + str r12, [sp, #12*4] + eor r7, r7, r10, ror #23 + + ldr r9, [sp, #9*4] + add r11, r8, r11 + ldr r10, [sp, #14*4] + add r12, lr, r12 + eor r9, r9, r11, ror #19 + add r11, r2, r1 + eor r10, r10, r12, ror #19 + add r12, r7, r6 + str r8, [sp, #8*4] + eor r3, r3, r11, ror #19 + str lr, [sp, #13*4] + eor r4, r4, r12, ror #19 + + ldr r11, [sp, #10*4] + add r8, r9, r8 + ldr r12, [sp, #15*4] + add lr, r10, lr + eor r11, r11, r8, ror #14 + add r8, r3, r2 + eor r12, r12, lr, ror #14 + add lr, r4, r7 + eor r0, r0, r8, ror #14 + ldr r8, [sp, #11*4] + eor r5, r5, lr, ror #14 + ldr lr, [sp, #12*4] + + add r8, r8, r12 + str r11, [sp, #10*4] + add lr, lr, r0 + str r12, [sp, #15*4] + eor r3, r3, r8, ror #25 + add r8, r5, r1 + eor r4, r4, lr, ror #25 + add lr, r11, r6 + str r9, [sp, #9*4] + eor r9, r9, r8, ror #25 + str r10, [sp, #14*4] + eor r10, r10, lr, ror #25 + + ldr r8, [sp, #8*4] + add r11, r11, r10 + ldr lr, [sp, #13*4] + add r12, r12, r3 + eor r2, r2, r11, ror #23 + add r11, r4, r0 + eor r7, r7, r12, ror #23 + add r12, r9, r5 + str r9, [sp, #9*4] + eor r8, r8, r11, ror #23 + str r10, [sp, #14*4] + eor lr, lr, r12, ror #23 + + ldr r11, [sp, #11*4] + add r9, lr, r9 + ldr r12, [sp, #12*4] + add r10, r2, r10 + eor r1, r1, r9, ror #19 + add r9, r7, r3 + eor r6, r6, r10, ror #19 + add r10, r8, r4 + str r8, [sp, #8*4] + eor r11, r11, r9, ror #19 + str lr, [sp, #13*4] + eor r12, r12, r10, ror #19 + + ldr r9, [sp, #10*4] + add r8, r12, r8 + ldr r10, [sp, #15*4] + add lr, r1, lr + eor r0, r0, r8, ror #14 + add r8, r6, r2 + eor r5, r5, lr, ror #14 + add lr, r11, r7 + eor r9, r9, r8, ror #14 + ldr r8, [sp, #9*4] + eor r10, r10, lr, ror #14 + ldr lr, [sp, #14*4] + + + add r8, r9, r8 + str r9, [sp, #10*4] + add lr, r10, lr + str r10, [sp, #15*4] + eor r11, r11, r8, ror #25 + add r8, r0, r3 + eor r12, r12, lr, ror #25 + add lr, r5, r4 + eor r1, r1, r8, ror #25 + ldr r8, [sp, #8*4] + eor r6, r6, lr, ror #25 + + add r9, r11, r9 + ldr lr, [sp, #13*4] + add r10, r12, r10 + eor r8, r8, r9, ror #23 + add r9, r1, r0 + eor lr, lr, r10, ror #23 + add r10, r6, r5 + str r11, [sp, #11*4] + eor r2, r2, r9, ror #23 + str r12, [sp, #12*4] + eor r7, r7, r10, ror #23 + + ldr r9, [sp, #9*4] + add r11, r8, r11 + ldr r10, [sp, #14*4] + add r12, lr, r12 + eor r9, r9, r11, ror #19 + add r11, r2, r1 + eor r10, r10, r12, ror #19 + add r12, r7, r6 + str r8, [sp, #8*4] + eor r3, r3, r11, ror #19 + str lr, [sp, #13*4] + eor r4, r4, r12, ror #19 + + ldr r11, [sp, #10*4] + add r8, r9, r8 + ldr r12, [sp, #15*4] + add lr, r10, lr + eor r11, r11, r8, ror #14 + add r8, r3, r2 + eor r12, r12, lr, ror #14 + add lr, r4, r7 + eor r0, r0, r8, ror #14 + ldr r8, [sp, #11*4] + eor r5, r5, lr, ror #14 + ldr lr, [sp, #12*4] + + add r8, r8, r12 + str r11, [sp, #10*4] + add lr, lr, r0 + str r12, [sp, #15*4] + eor r3, r3, r8, ror #25 + add r8, r5, r1 + eor r4, r4, lr, ror #25 + add lr, r11, r6 + str r9, [sp, #9*4] + eor r9, r9, r8, ror #25 + str r10, [sp, #14*4] + eor r10, r10, lr, ror #25 + + ldr r8, [sp, #8*4] + add r11, r11, r10 + ldr lr, [sp, #13*4] + add r12, r12, r3 + eor r2, r2, r11, ror #23 + add r11, r4, r0 + eor r7, r7, r12, ror #23 + add r12, r9, r5 + str r9, [sp, 
#9*4] + eor r8, r8, r11, ror #23 + str r10, [sp, #14*4] + eor lr, lr, r12, ror #23 + + ldr r11, [sp, #11*4] + add r9, lr, r9 + ldr r12, [sp, #12*4] + add r10, r2, r10 + eor r1, r1, r9, ror #19 + add r9, r7, r3 + eor r6, r6, r10, ror #19 + add r10, r8, r4 + str r8, [sp, #8*4] + eor r11, r11, r9, ror #19 + str lr, [sp, #13*4] + eor r12, r12, r10, ror #19 + + ldr r9, [sp, #10*4] + add r8, r12, r8 + ldr r10, [sp, #15*4] + add lr, r1, lr + eor r0, r0, r8, ror #14 + add r8, r6, r2 + eor r5, r5, lr, ror #14 + add lr, r11, r7 + eor r9, r9, r8, ror #14 + ldr r8, [sp, #9*4] + eor r10, r10, lr, ror #14 + ldr lr, [sp, #14*4] + + + add r8, r9, r8 + str r9, [sp, #10*4] + add lr, r10, lr + str r10, [sp, #15*4] + eor r11, r11, r8, ror #25 + add r8, r0, r3 + eor r12, r12, lr, ror #25 + add lr, r5, r4 + eor r1, r1, r8, ror #25 + ldr r8, [sp, #8*4] + eor r6, r6, lr, ror #25 + + add r9, r11, r9 + ldr lr, [sp, #13*4] + add r10, r12, r10 + eor r8, r8, r9, ror #23 + add r9, r1, r0 + eor lr, lr, r10, ror #23 + add r10, r6, r5 + str r11, [sp, #11*4] + eor r2, r2, r9, ror #23 + str r12, [sp, #12*4] + eor r7, r7, r10, ror #23 + + ldr r9, [sp, #9*4] + add r11, r8, r11 + ldr r10, [sp, #14*4] + add r12, lr, r12 + eor r9, r9, r11, ror #19 + add r11, r2, r1 + eor r10, r10, r12, ror #19 + add r12, r7, r6 + str r8, [sp, #8*4] + eor r3, r3, r11, ror #19 + str lr, [sp, #13*4] + eor r4, r4, r12, ror #19 + + ldr r11, [sp, #10*4] + add r8, r9, r8 + ldr r12, [sp, #15*4] + add lr, r10, lr + str r9, [sp, #9*4] + eor r11, r11, r8, ror #14 + eor r12, r12, lr, ror #14 + add r8, r3, r2 + str r10, [sp, #14*4] + add lr, r4, r7 + str r11, [sp, #10*4] + eor r0, r0, r8, ror #14 + str r12, [sp, #15*4] + eor r5, r5, lr, ror #14 + + stmia sp, {r0-r7} + + ldr r0, [sp, #16*4] + mov r1, sp + add r0, r0, #16*4 + ldmia r1!, {r2-r7} + ldmia r0, {r8-r12, lr} + add r2, r2, r8 + add r3, r3, r9 + add r4, r4, r10 + add r5, r5, r11 + add r6, r6, r12 + add r7, r7, lr + stmia r0!, {r2-r7} + ldmia r1!, {r2-r7} + ldmia r0, {r8-r12, lr} + add r2, r2, r8 + add r3, r3, r9 + add r4, r4, r10 + add r5, r5, r11 + add r6, r6, r12 + add r7, r7, lr + stmia r0!, {r2-r7} + ldr r3, [sp, #17*4] + ldr r12, [sp, #18*4] + ldmia r1!, {r4-r7} + ldmia r0, {r8-r11} + add r4, r4, r8 + add r5, r5, r9 + add r6, r6, r10 + add r7, r7, r11 + stmia r0!, {r4-r7} + + add r1, r3, #16*4 + sub r0, r0, #32*4 + cmp r1, r12 + bne scrypt_core_loop1 + + ldr r12, [sp, #21*4] + ldr r4, [r0, #16*4] + sub r2, r12, #1 + str r2, [sp, #21*4] + sub r1, r1, r12, lsl #7 + str r1, [sp, #17*4] + and r4, r4, r2 + add r1, r1, r4, lsl #7 +scrypt_core_loop2: + add r2, r0, #16*4 + add r3, r1, #16*4 + str r12, [sp, #18*4] + mov r12, sp +#ifdef __ARM_ARCH_5E_OR_6_OR_7__ + pld [r1, #24*4] + pld [r1, #8*4] +#endif + ldmia r3!, {r8-r11} + ldmia r2, {r4-r7} + eor r8, r8, r4 + eor r9, r9, r5 + eor r10, r10, r6 + eor r11, r11, r7 + ldmia r0, {r4-r7} + stmia r2!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + ldmia r1!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r0!, {r4-r7} + stmia r12!, {r4-r7} + ldmia r3!, {r8-r11} + ldmia r2, {r4-r7} + eor r8, r8, r4 + eor r9, r9, r5 + eor r10, r10, r6 + eor r11, r11, r7 + ldmia r0, {r4-r7} + stmia r2!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + ldmia r1!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r0!, {r4-r7} + stmia r12!, {r4-r7} + ldmia r3!, {r8-r11} + ldmia r2, {r4-r7} + eor r8, r8, r4 + eor r9, r9, r5 + eor r10, r10, r6 
+ eor r11, r11, r7 + ldmia r0, {r4-r7} + stmia r2!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + ldmia r1!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r0!, {r4-r7} + stmia r12!, {r4-r7} + ldmia r3!, {r8-r11} + ldmia r2, {r4-r7} + eor r8, r8, r4 + eor r9, r9, r5 + eor r10, r10, r6 + eor r11, r11, r7 + ldmia r0, {r4-r7} + stmia r2!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + ldmia r1!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r0!, {r4-r7} + stmia r12!, {r4-r7} + + ldmia sp, {r0-r7} + + ldr r12, [sp, #15*4] + ldr r8, [sp, #11*4] + ldr lr, [sp, #12*4] + + ldr r9, [sp, #9*4] + add r8, r8, r12 + ldr r11, [sp, #10*4] + add lr, lr, r0 + eor r3, r3, r8, ror #25 + add r8, r5, r1 + ldr r10, [sp, #14*4] + eor r4, r4, lr, ror #25 + add lr, r11, r6 + eor r9, r9, r8, ror #25 + eor r10, r10, lr, ror #25 + + ldr r8, [sp, #8*4] + add r11, r11, r10 + ldr lr, [sp, #13*4] + add r12, r12, r3 + eor r2, r2, r11, ror #23 + add r11, r4, r0 + eor r7, r7, r12, ror #23 + add r12, r9, r5 + str r9, [sp, #9*4] + eor r8, r8, r11, ror #23 + str r10, [sp, #14*4] + eor lr, lr, r12, ror #23 + + ldr r11, [sp, #11*4] + add r9, lr, r9 + ldr r12, [sp, #12*4] + add r10, r2, r10 + eor r1, r1, r9, ror #19 + add r9, r7, r3 + eor r6, r6, r10, ror #19 + add r10, r8, r4 + str r8, [sp, #8*4] + eor r11, r11, r9, ror #19 + str lr, [sp, #13*4] + eor r12, r12, r10, ror #19 + + ldr r9, [sp, #10*4] + add r8, r12, r8 + ldr r10, [sp, #15*4] + add lr, r1, lr + eor r0, r0, r8, ror #14 + add r8, r6, r2 + eor r5, r5, lr, ror #14 + add lr, r11, r7 + eor r9, r9, r8, ror #14 + ldr r8, [sp, #9*4] + eor r10, r10, lr, ror #14 + ldr lr, [sp, #14*4] + + + add r8, r9, r8 + str r9, [sp, #10*4] + add lr, r10, lr + str r10, [sp, #15*4] + eor r11, r11, r8, ror #25 + add r8, r0, r3 + eor r12, r12, lr, ror #25 + add lr, r5, r4 + eor r1, r1, r8, ror #25 + ldr r8, [sp, #8*4] + eor r6, r6, lr, ror #25 + + add r9, r11, r9 + ldr lr, [sp, #13*4] + add r10, r12, r10 + eor r8, r8, r9, ror #23 + add r9, r1, r0 + eor lr, lr, r10, ror #23 + add r10, r6, r5 + str r11, [sp, #11*4] + eor r2, r2, r9, ror #23 + str r12, [sp, #12*4] + eor r7, r7, r10, ror #23 + + ldr r9, [sp, #9*4] + add r11, r8, r11 + ldr r10, [sp, #14*4] + add r12, lr, r12 + eor r9, r9, r11, ror #19 + add r11, r2, r1 + eor r10, r10, r12, ror #19 + add r12, r7, r6 + str r8, [sp, #8*4] + eor r3, r3, r11, ror #19 + str lr, [sp, #13*4] + eor r4, r4, r12, ror #19 + + ldr r11, [sp, #10*4] + add r8, r9, r8 + ldr r12, [sp, #15*4] + add lr, r10, lr + eor r11, r11, r8, ror #14 + add r8, r3, r2 + eor r12, r12, lr, ror #14 + add lr, r4, r7 + eor r0, r0, r8, ror #14 + ldr r8, [sp, #11*4] + eor r5, r5, lr, ror #14 + ldr lr, [sp, #12*4] + + add r8, r8, r12 + str r11, [sp, #10*4] + add lr, lr, r0 + str r12, [sp, #15*4] + eor r3, r3, r8, ror #25 + add r8, r5, r1 + eor r4, r4, lr, ror #25 + add lr, r11, r6 + str r9, [sp, #9*4] + eor r9, r9, r8, ror #25 + str r10, [sp, #14*4] + eor r10, r10, lr, ror #25 + + ldr r8, [sp, #8*4] + add r11, r11, r10 + ldr lr, [sp, #13*4] + add r12, r12, r3 + eor r2, r2, r11, ror #23 + add r11, r4, r0 + eor r7, r7, r12, ror #23 + add r12, r9, r5 + str r9, [sp, #9*4] + eor r8, r8, r11, ror #23 + str r10, [sp, #14*4] + eor lr, lr, r12, ror #23 + + ldr r11, [sp, #11*4] + add r9, lr, r9 + ldr r12, [sp, #12*4] + add r10, r2, r10 + eor r1, r1, r9, ror #19 + add r9, r7, r3 + eor r6, r6, r10, ror #19 + add r10, r8, r4 + str r8, [sp, #8*4] + eor r11, 
r11, r9, ror #19 + str lr, [sp, #13*4] + eor r12, r12, r10, ror #19 + + ldr r9, [sp, #10*4] + add r8, r12, r8 + ldr r10, [sp, #15*4] + add lr, r1, lr + eor r0, r0, r8, ror #14 + add r8, r6, r2 + eor r5, r5, lr, ror #14 + add lr, r11, r7 + eor r9, r9, r8, ror #14 + ldr r8, [sp, #9*4] + eor r10, r10, lr, ror #14 + ldr lr, [sp, #14*4] + + + add r8, r9, r8 + str r9, [sp, #10*4] + add lr, r10, lr + str r10, [sp, #15*4] + eor r11, r11, r8, ror #25 + add r8, r0, r3 + eor r12, r12, lr, ror #25 + add lr, r5, r4 + eor r1, r1, r8, ror #25 + ldr r8, [sp, #8*4] + eor r6, r6, lr, ror #25 + + add r9, r11, r9 + ldr lr, [sp, #13*4] + add r10, r12, r10 + eor r8, r8, r9, ror #23 + add r9, r1, r0 + eor lr, lr, r10, ror #23 + add r10, r6, r5 + str r11, [sp, #11*4] + eor r2, r2, r9, ror #23 + str r12, [sp, #12*4] + eor r7, r7, r10, ror #23 + + ldr r9, [sp, #9*4] + add r11, r8, r11 + ldr r10, [sp, #14*4] + add r12, lr, r12 + eor r9, r9, r11, ror #19 + add r11, r2, r1 + eor r10, r10, r12, ror #19 + add r12, r7, r6 + str r8, [sp, #8*4] + eor r3, r3, r11, ror #19 + str lr, [sp, #13*4] + eor r4, r4, r12, ror #19 + + ldr r11, [sp, #10*4] + add r8, r9, r8 + ldr r12, [sp, #15*4] + add lr, r10, lr + eor r11, r11, r8, ror #14 + add r8, r3, r2 + eor r12, r12, lr, ror #14 + add lr, r4, r7 + eor r0, r0, r8, ror #14 + ldr r8, [sp, #11*4] + eor r5, r5, lr, ror #14 + ldr lr, [sp, #12*4] + + add r8, r8, r12 + str r11, [sp, #10*4] + add lr, lr, r0 + str r12, [sp, #15*4] + eor r3, r3, r8, ror #25 + add r8, r5, r1 + eor r4, r4, lr, ror #25 + add lr, r11, r6 + str r9, [sp, #9*4] + eor r9, r9, r8, ror #25 + str r10, [sp, #14*4] + eor r10, r10, lr, ror #25 + + ldr r8, [sp, #8*4] + add r11, r11, r10 + ldr lr, [sp, #13*4] + add r12, r12, r3 + eor r2, r2, r11, ror #23 + add r11, r4, r0 + eor r7, r7, r12, ror #23 + add r12, r9, r5 + str r9, [sp, #9*4] + eor r8, r8, r11, ror #23 + str r10, [sp, #14*4] + eor lr, lr, r12, ror #23 + + ldr r11, [sp, #11*4] + add r9, lr, r9 + ldr r12, [sp, #12*4] + add r10, r2, r10 + eor r1, r1, r9, ror #19 + add r9, r7, r3 + eor r6, r6, r10, ror #19 + add r10, r8, r4 + str r8, [sp, #8*4] + eor r11, r11, r9, ror #19 + str lr, [sp, #13*4] + eor r12, r12, r10, ror #19 + + ldr r9, [sp, #10*4] + add r8, r12, r8 + ldr r10, [sp, #15*4] + add lr, r1, lr + eor r0, r0, r8, ror #14 + add r8, r6, r2 + eor r5, r5, lr, ror #14 + add lr, r11, r7 + eor r9, r9, r8, ror #14 + ldr r8, [sp, #9*4] + eor r10, r10, lr, ror #14 + ldr lr, [sp, #14*4] + + + add r8, r9, r8 + str r9, [sp, #10*4] + add lr, r10, lr + str r10, [sp, #15*4] + eor r11, r11, r8, ror #25 + add r8, r0, r3 + eor r12, r12, lr, ror #25 + add lr, r5, r4 + eor r1, r1, r8, ror #25 + ldr r8, [sp, #8*4] + eor r6, r6, lr, ror #25 + + add r9, r11, r9 + ldr lr, [sp, #13*4] + add r10, r12, r10 + eor r8, r8, r9, ror #23 + add r9, r1, r0 + eor lr, lr, r10, ror #23 + add r10, r6, r5 + str r11, [sp, #11*4] + eor r2, r2, r9, ror #23 + str r12, [sp, #12*4] + eor r7, r7, r10, ror #23 + + ldr r9, [sp, #9*4] + add r11, r8, r11 + ldr r10, [sp, #14*4] + add r12, lr, r12 + eor r9, r9, r11, ror #19 + add r11, r2, r1 + eor r10, r10, r12, ror #19 + add r12, r7, r6 + str r8, [sp, #8*4] + eor r3, r3, r11, ror #19 + str lr, [sp, #13*4] + eor r4, r4, r12, ror #19 + + ldr r11, [sp, #10*4] + add r8, r9, r8 + ldr r12, [sp, #15*4] + add lr, r10, lr + eor r11, r11, r8, ror #14 + add r8, r3, r2 + eor r12, r12, lr, ror #14 + add lr, r4, r7 + eor r0, r0, r8, ror #14 + ldr r8, [sp, #11*4] + eor r5, r5, lr, ror #14 + ldr lr, [sp, #12*4] + + add r8, r8, r12 + str r11, [sp, #10*4] + add lr, lr, r0 + str 
r12, [sp, #15*4] + eor r3, r3, r8, ror #25 + add r8, r5, r1 + eor r4, r4, lr, ror #25 + add lr, r11, r6 + str r9, [sp, #9*4] + eor r9, r9, r8, ror #25 + str r10, [sp, #14*4] + eor r10, r10, lr, ror #25 + + ldr r8, [sp, #8*4] + add r11, r11, r10 + ldr lr, [sp, #13*4] + add r12, r12, r3 + eor r2, r2, r11, ror #23 + add r11, r4, r0 + eor r7, r7, r12, ror #23 + add r12, r9, r5 + str r9, [sp, #9*4] + eor r8, r8, r11, ror #23 + str r10, [sp, #14*4] + eor lr, lr, r12, ror #23 + + ldr r11, [sp, #11*4] + add r9, lr, r9 + ldr r12, [sp, #12*4] + add r10, r2, r10 + eor r1, r1, r9, ror #19 + add r9, r7, r3 + eor r6, r6, r10, ror #19 + add r10, r8, r4 + str r8, [sp, #8*4] + eor r11, r11, r9, ror #19 + str lr, [sp, #13*4] + eor r12, r12, r10, ror #19 + + ldr r9, [sp, #10*4] + add r8, r12, r8 + ldr r10, [sp, #15*4] + add lr, r1, lr + eor r0, r0, r8, ror #14 + add r8, r6, r2 + eor r5, r5, lr, ror #14 + add lr, r11, r7 + eor r9, r9, r8, ror #14 + ldr r8, [sp, #9*4] + eor r10, r10, lr, ror #14 + ldr lr, [sp, #14*4] + + + add r8, r9, r8 + str r9, [sp, #10*4] + add lr, r10, lr + str r10, [sp, #15*4] + eor r11, r11, r8, ror #25 + add r8, r0, r3 + eor r12, r12, lr, ror #25 + add lr, r5, r4 + eor r1, r1, r8, ror #25 + ldr r8, [sp, #8*4] + eor r6, r6, lr, ror #25 + + add r9, r11, r9 + ldr lr, [sp, #13*4] + add r10, r12, r10 + eor r8, r8, r9, ror #23 + add r9, r1, r0 + eor lr, lr, r10, ror #23 + add r10, r6, r5 + str r11, [sp, #11*4] + eor r2, r2, r9, ror #23 + str r12, [sp, #12*4] + eor r7, r7, r10, ror #23 + + ldr r9, [sp, #9*4] + add r11, r8, r11 + ldr r10, [sp, #14*4] + add r12, lr, r12 + eor r9, r9, r11, ror #19 + add r11, r2, r1 + eor r10, r10, r12, ror #19 + add r12, r7, r6 + str r8, [sp, #8*4] + eor r3, r3, r11, ror #19 + str lr, [sp, #13*4] + eor r4, r4, r12, ror #19 + + ldr r11, [sp, #10*4] + add r8, r9, r8 + ldr r12, [sp, #15*4] + add lr, r10, lr + str r9, [sp, #9*4] + eor r11, r11, r8, ror #14 + eor r12, r12, lr, ror #14 + add r8, r3, r2 + str r10, [sp, #14*4] + add lr, r4, r7 + str r11, [sp, #10*4] + eor r0, r0, r8, ror #14 + str r12, [sp, #15*4] + eor r5, r5, lr, ror #14 + + stmia sp, {r0-r7} + + ldr r0, [sp, #16*4] + mov r12, sp + add r2, r0, #16*4 + ldmia r12, {r4-r7} + ldmia r0, {r8-r11} + add r4, r4, r8 + add r5, r5, r9 + add r6, r6, r10 + add r7, r7, r11 + stmia r0!, {r4-r7} + ldmia r2, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r2!, {r4-r7} + stmia r12!, {r4-r7} + ldmia r12, {r4-r7} + ldmia r0, {r8-r11} + add r4, r4, r8 + add r5, r5, r9 + add r6, r6, r10 + add r7, r7, r11 + stmia r0!, {r4-r7} + ldmia r2, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r2!, {r4-r7} + stmia r12!, {r4-r7} + ldmia r12, {r4-r7} + ldmia r0, {r8-r11} + add r4, r4, r8 + add r5, r5, r9 + add r6, r6, r10 + add r7, r7, r11 + stmia r0!, {r4-r7} + ldmia r2, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r2!, {r4-r7} + stmia r12!, {r4-r7} + ldmia r12, {r4-r7} + ldmia r0, {r8-r11} + add r4, r4, r8 + add r5, r5, r9 + add r6, r6, r10 + add r7, r7, r11 + stmia r0!, {r4-r7} + ldmia r2, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r2!, {r4-r7} + stmia r12!, {r4-r7} + + ldmia sp, {r0-r7} + + ldr r12, [sp, #15*4] + ldr r8, [sp, #11*4] + ldr lr, [sp, #12*4] + + ldr r9, [sp, #9*4] + add r8, r8, r12 + ldr r11, [sp, #10*4] + add lr, lr, r0 + eor r3, r3, r8, ror #25 + add r8, r5, r1 + ldr r10, [sp, #14*4] + eor r4, r4, lr, ror #25 + add lr, r11, r6 + eor r9, r9, r8, ror #25 + 
eor r10, r10, lr, ror #25 + + ldr r8, [sp, #8*4] + add r11, r11, r10 + ldr lr, [sp, #13*4] + add r12, r12, r3 + eor r2, r2, r11, ror #23 + add r11, r4, r0 + eor r7, r7, r12, ror #23 + add r12, r9, r5 + str r9, [sp, #9*4] + eor r8, r8, r11, ror #23 + str r10, [sp, #14*4] + eor lr, lr, r12, ror #23 + + ldr r11, [sp, #11*4] + add r9, lr, r9 + ldr r12, [sp, #12*4] + add r10, r2, r10 + eor r1, r1, r9, ror #19 + add r9, r7, r3 + eor r6, r6, r10, ror #19 + add r10, r8, r4 + str r8, [sp, #8*4] + eor r11, r11, r9, ror #19 + str lr, [sp, #13*4] + eor r12, r12, r10, ror #19 + + ldr r9, [sp, #10*4] + add r8, r12, r8 + ldr r10, [sp, #15*4] + add lr, r1, lr + eor r0, r0, r8, ror #14 + add r8, r6, r2 + eor r5, r5, lr, ror #14 + add lr, r11, r7 + eor r9, r9, r8, ror #14 + ldr r8, [sp, #9*4] + eor r10, r10, lr, ror #14 + ldr lr, [sp, #14*4] + + + add r8, r9, r8 + str r9, [sp, #10*4] + add lr, r10, lr + str r10, [sp, #15*4] + eor r11, r11, r8, ror #25 + add r8, r0, r3 + eor r12, r12, lr, ror #25 + add lr, r5, r4 + eor r1, r1, r8, ror #25 + ldr r8, [sp, #8*4] + eor r6, r6, lr, ror #25 + + add r9, r11, r9 + ldr lr, [sp, #13*4] + add r10, r12, r10 + eor r8, r8, r9, ror #23 + add r9, r1, r0 + eor lr, lr, r10, ror #23 + add r10, r6, r5 + str r11, [sp, #11*4] + eor r2, r2, r9, ror #23 + str r12, [sp, #12*4] + eor r7, r7, r10, ror #23 + + ldr r9, [sp, #9*4] + add r11, r8, r11 + ldr r10, [sp, #14*4] + add r12, lr, r12 + eor r9, r9, r11, ror #19 + add r11, r2, r1 + eor r10, r10, r12, ror #19 + add r12, r7, r6 + str r8, [sp, #8*4] + eor r3, r3, r11, ror #19 + str lr, [sp, #13*4] + eor r4, r4, r12, ror #19 + + ldr r11, [sp, #10*4] + add r8, r9, r8 + ldr r12, [sp, #15*4] + add lr, r10, lr + eor r11, r11, r8, ror #14 + add r8, r3, r2 + eor r12, r12, lr, ror #14 + add lr, r4, r7 + eor r0, r0, r8, ror #14 + ldr r8, [sp, #11*4] + eor r5, r5, lr, ror #14 + ldr lr, [sp, #12*4] + + add r8, r8, r12 + str r11, [sp, #10*4] + add lr, lr, r0 + str r12, [sp, #15*4] + eor r3, r3, r8, ror #25 + add r8, r5, r1 + eor r4, r4, lr, ror #25 + add lr, r11, r6 + str r9, [sp, #9*4] + eor r9, r9, r8, ror #25 + str r10, [sp, #14*4] + eor r10, r10, lr, ror #25 + + ldr r8, [sp, #8*4] + add r11, r11, r10 + ldr lr, [sp, #13*4] + add r12, r12, r3 + eor r2, r2, r11, ror #23 + add r11, r4, r0 + eor r7, r7, r12, ror #23 + add r12, r9, r5 + str r9, [sp, #9*4] + eor r8, r8, r11, ror #23 + str r10, [sp, #14*4] + eor lr, lr, r12, ror #23 + + ldr r11, [sp, #11*4] + add r9, lr, r9 + ldr r12, [sp, #12*4] + add r10, r2, r10 + eor r1, r1, r9, ror #19 + add r9, r7, r3 + eor r6, r6, r10, ror #19 + add r10, r8, r4 + str r8, [sp, #8*4] + eor r11, r11, r9, ror #19 + str lr, [sp, #13*4] + eor r12, r12, r10, ror #19 + + ldr r9, [sp, #10*4] + add r8, r12, r8 + ldr r10, [sp, #15*4] + add lr, r1, lr + eor r0, r0, r8, ror #14 + add r8, r6, r2 + eor r5, r5, lr, ror #14 + add lr, r11, r7 + eor r9, r9, r8, ror #14 + ldr r8, [sp, #9*4] + eor r10, r10, lr, ror #14 + ldr lr, [sp, #14*4] + + + add r8, r9, r8 + str r9, [sp, #10*4] + add lr, r10, lr + str r10, [sp, #15*4] + eor r11, r11, r8, ror #25 + add r8, r0, r3 + eor r12, r12, lr, ror #25 + add lr, r5, r4 + eor r1, r1, r8, ror #25 + ldr r8, [sp, #8*4] + eor r6, r6, lr, ror #25 + + add r9, r11, r9 + ldr lr, [sp, #13*4] + add r10, r12, r10 + eor r8, r8, r9, ror #23 + add r9, r1, r0 + eor lr, lr, r10, ror #23 + add r10, r6, r5 + str r11, [sp, #11*4] + eor r2, r2, r9, ror #23 + str r12, [sp, #12*4] + eor r7, r7, r10, ror #23 + + ldr r9, [sp, #9*4] + add r11, r8, r11 + ldr r10, [sp, #14*4] + add r12, lr, r12 + eor r9, r9, r11, ror 
#19 + add r11, r2, r1 + eor r10, r10, r12, ror #19 + add r12, r7, r6 + str r8, [sp, #8*4] + eor r3, r3, r11, ror #19 + str lr, [sp, #13*4] + eor r4, r4, r12, ror #19 + + ldr r11, [sp, #10*4] + add r8, r9, r8 + ldr r12, [sp, #15*4] + add lr, r10, lr + eor r11, r11, r8, ror #14 + add r8, r3, r2 + eor r12, r12, lr, ror #14 + add lr, r4, r7 + eor r0, r0, r8, ror #14 + ldr r8, [sp, #11*4] + eor r5, r5, lr, ror #14 + ldr lr, [sp, #12*4] + + add r8, r8, r12 + str r11, [sp, #10*4] + add lr, lr, r0 + str r12, [sp, #15*4] + eor r3, r3, r8, ror #25 + add r8, r5, r1 + eor r4, r4, lr, ror #25 + add lr, r11, r6 + str r9, [sp, #9*4] + eor r9, r9, r8, ror #25 + str r10, [sp, #14*4] + eor r10, r10, lr, ror #25 + + ldr r8, [sp, #8*4] + add r11, r11, r10 + ldr lr, [sp, #13*4] + add r12, r12, r3 + eor r2, r2, r11, ror #23 + add r11, r4, r0 + eor r7, r7, r12, ror #23 + add r12, r9, r5 + str r9, [sp, #9*4] + eor r8, r8, r11, ror #23 + str r10, [sp, #14*4] + eor lr, lr, r12, ror #23 + + ldr r11, [sp, #11*4] + add r9, lr, r9 + ldr r12, [sp, #12*4] + add r10, r2, r10 + eor r1, r1, r9, ror #19 + add r9, r7, r3 + eor r6, r6, r10, ror #19 + add r10, r8, r4 + str r8, [sp, #8*4] + eor r11, r11, r9, ror #19 + str lr, [sp, #13*4] + eor r12, r12, r10, ror #19 + + ldr r9, [sp, #10*4] + add r8, r12, r8 + ldr r10, [sp, #15*4] + add lr, r1, lr + eor r0, r0, r8, ror #14 + add r8, r6, r2 + eor r5, r5, lr, ror #14 + add lr, r11, r7 + eor r9, r9, r8, ror #14 + ldr r8, [sp, #9*4] + eor r10, r10, lr, ror #14 + ldr lr, [sp, #14*4] + + + add r8, r9, r8 + str r9, [sp, #10*4] + add lr, r10, lr + str r10, [sp, #15*4] + eor r11, r11, r8, ror #25 + add r8, r0, r3 + eor r12, r12, lr, ror #25 + add lr, r5, r4 + eor r1, r1, r8, ror #25 + ldr r8, [sp, #8*4] + eor r6, r6, lr, ror #25 + + add r9, r11, r9 + ldr lr, [sp, #13*4] + add r10, r12, r10 + eor r8, r8, r9, ror #23 + add r9, r1, r0 + eor lr, lr, r10, ror #23 + add r10, r6, r5 + str r11, [sp, #11*4] + eor r2, r2, r9, ror #23 + str r12, [sp, #12*4] + eor r7, r7, r10, ror #23 + + ldr r9, [sp, #9*4] + add r11, r8, r11 + ldr r10, [sp, #14*4] + add r12, lr, r12 + eor r9, r9, r11, ror #19 + add r11, r2, r1 + eor r10, r10, r12, ror #19 + add r12, r7, r6 + str r8, [sp, #8*4] + eor r3, r3, r11, ror #19 + str lr, [sp, #13*4] + eor r4, r4, r12, ror #19 + + ldr r11, [sp, #10*4] + add r8, r9, r8 + ldr r12, [sp, #15*4] + add lr, r10, lr + eor r11, r11, r8, ror #14 + add r8, r3, r2 + eor r12, r12, lr, ror #14 + add lr, r4, r7 + eor r0, r0, r8, ror #14 + ldr r8, [sp, #11*4] + eor r5, r5, lr, ror #14 + ldr lr, [sp, #12*4] + + add r8, r8, r12 + str r11, [sp, #10*4] + add lr, lr, r0 + str r12, [sp, #15*4] + eor r3, r3, r8, ror #25 + add r8, r5, r1 + eor r4, r4, lr, ror #25 + add lr, r11, r6 + str r9, [sp, #9*4] + eor r9, r9, r8, ror #25 + str r10, [sp, #14*4] + eor r10, r10, lr, ror #25 + + ldr r8, [sp, #8*4] + add r11, r11, r10 + ldr lr, [sp, #13*4] + add r12, r12, r3 + eor r2, r2, r11, ror #23 + add r11, r4, r0 + eor r7, r7, r12, ror #23 + add r12, r9, r5 + str r9, [sp, #9*4] + eor r8, r8, r11, ror #23 + str r10, [sp, #14*4] + eor lr, lr, r12, ror #23 + + ldr r11, [sp, #11*4] + add r9, lr, r9 + ldr r12, [sp, #12*4] + add r10, r2, r10 + eor r1, r1, r9, ror #19 + add r9, r7, r3 + eor r6, r6, r10, ror #19 + add r10, r8, r4 + str r8, [sp, #8*4] + eor r11, r11, r9, ror #19 + str lr, [sp, #13*4] + eor r12, r12, r10, ror #19 + + ldr r9, [sp, #10*4] + add r8, r12, r8 + ldr r10, [sp, #15*4] + add lr, r1, lr + eor r0, r0, r8, ror #14 + add r8, r6, r2 + eor r5, r5, lr, ror #14 + add lr, r11, r7 + eor r9, r9, r8, ror 
#14 + ldr r8, [sp, #9*4] + eor r10, r10, lr, ror #14 + ldr lr, [sp, #14*4] + + + add r8, r9, r8 + str r9, [sp, #10*4] + add lr, r10, lr + str r10, [sp, #15*4] + eor r11, r11, r8, ror #25 + add r8, r0, r3 + eor r12, r12, lr, ror #25 + add lr, r5, r4 + eor r1, r1, r8, ror #25 + ldr r8, [sp, #8*4] + eor r6, r6, lr, ror #25 + + add r9, r11, r9 + ldr lr, [sp, #13*4] + add r10, r12, r10 + eor r8, r8, r9, ror #23 + add r9, r1, r0 + eor lr, lr, r10, ror #23 + add r10, r6, r5 + str r11, [sp, #11*4] + eor r2, r2, r9, ror #23 + str r12, [sp, #12*4] + eor r7, r7, r10, ror #23 + + ldr r9, [sp, #9*4] + add r11, r8, r11 + ldr r10, [sp, #14*4] + add r12, lr, r12 + eor r9, r9, r11, ror #19 + add r11, r2, r1 + eor r10, r10, r12, ror #19 + add r12, r7, r6 + str r8, [sp, #8*4] + eor r3, r3, r11, ror #19 + str lr, [sp, #13*4] + eor r4, r4, r12, ror #19 + + ldr r11, [sp, #10*4] + add r8, r9, r8 + ldr r12, [sp, #15*4] + add lr, r10, lr + str r9, [sp, #9*4] + eor r11, r11, r8, ror #14 + eor r12, r12, lr, ror #14 + add r8, r3, r2 + str r10, [sp, #14*4] + add lr, r4, r7 + str r11, [sp, #10*4] + eor r0, r0, r8, ror #14 + str r12, [sp, #15*4] + eor r5, r5, lr, ror #14 + + stmia sp, {r0-r7} + + ldr r0, [sp, #16*4] + mov r1, sp + ldr r3, [sp, #17*4] + add r0, r0, #16*4 + ldr r2, [sp, #21*4] + ldmia r1!, {r4-r7} + ldmia r0, {r8-r11} + add r4, r4, r8 + add r5, r5, r9 + add r6, r6, r10 + add r7, r7, r11 + stmia r0!, {r4-r7} + and r4, r4, r2 + add r3, r3, r4, lsl #7 + str r3, [sp, #19*4] +#ifdef __ARM_ARCH_5E_OR_6_OR_7__ + pld [r3, #16*4] + pld [r3] +#endif + ldmia r1!, {r2-r7} + ldmia r0, {r8-r12, lr} + add r2, r2, r8 + add r3, r3, r9 + add r4, r4, r10 + add r5, r5, r11 + add r6, r6, r12 + add r7, r7, lr + stmia r0!, {r2-r7} + ldmia r1!, {r2-r7} + ldmia r0, {r8-r12, lr} + add r2, r2, r8 + add r3, r3, r9 + add r4, r4, r10 + add r5, r5, r11 + add r6, r6, r12 + add r7, r7, lr + stmia r0!, {r2-r7} + + ldr r12, [sp, #18*4] + sub r0, r0, #32*4 + ldr r1, [sp, #19*4] + subs r12, r12, #1 + bne scrypt_core_loop2 - scrypt_shuffle ldr sp, [sp, #20*4] #ifdef __thumb__ @@ -588,7 +2102,3081 @@ scrypt_core_loop2: #ifdef __ARM_NEON__ -.macro salsa8_core_3way_doubleround + + + .text + .code 32 + .align 2 + .globl scrypt_core_3way + .globl _scrypt_core_3way +#ifdef __ELF__ + .type scrypt_core_3way, %function +#endif +scrypt_core_3way: +_scrypt_core_3way: + stmfd sp!, {r4-r11, lr} + vpush {q4-q7} + mov r12, sp + sub sp, sp, #24*16 + bic sp, sp, #63 + str r2, [sp, #4*16+3*4] + str r12, [sp, #4*16+4*4] + + mov r3, r0 + vldmia r3!, {q8-q15} + vmov.u64 q0, #0xffffffff + vmov.u32 q1, q8 + vmov.u32 q2, q12 + vbif.u32 q8, q9, q0 + vbif.u32 q12, q13, q0 + vbif.u32 q9, q10, q0 + vbif.u32 q13, q14, q0 + vbif.u32 q10, q11, q0 + vbif.u32 q14, q15, q0 + vbif.u32 q11, q1, q0 + vbif.u32 q15, q2, q0 + vldmia r3!, {q0-q7} + vswp.u32 d17, d21 + vswp.u32 d25, d29 + vswp.u32 d18, d22 + vswp.u32 d26, d30 + vstmia r0, {q8-q15} + vmov.u64 q8, #0xffffffff + vmov.u32 q9, q0 + vmov.u32 q10, q4 + vbif.u32 q0, q1, q8 + vbif.u32 q4, q5, q8 + vbif.u32 q1, q2, q8 + vbif.u32 q5, q6, q8 + vbif.u32 q2, q3, q8 + vbif.u32 q6, q7, q8 + vbif.u32 q3, q9, q8 + vbif.u32 q7, q10, q8 + vldmia r3, {q8-q15} + vswp.u32 d1, d5 + vswp.u32 d9, d13 + vswp.u32 d2, d6 + vswp.u32 d10, d14 + add r12, sp, #8*16 + vstmia r12!, {q0-q7} + vmov.u64 q0, #0xffffffff + vmov.u32 q1, q8 + vmov.u32 q2, q12 + vbif.u32 q8, q9, q0 + vbif.u32 q12, q13, q0 + vbif.u32 q9, q10, q0 + vbif.u32 q13, q14, q0 + vbif.u32 q10, q11, q0 + vbif.u32 q14, q15, q0 + vbif.u32 q11, q1, q0 + vbif.u32 q15, q2, q0 + vswp.u32 d17, 
d21 + vswp.u32 d25, d29 + vswp.u32 d18, d22 + vswp.u32 d26, d30 + vstmia r12, {q8-q15} + + add lr, sp, #128 + vldmia lr, {q0-q7} + add r2, r1, r2, lsl #7 + str r0, [sp, #4*16+0*4] + str r2, [sp, #4*16+2*4] +scrypt_core_3way_loop1: + add lr, r0, #16*4 + add r3, r1, #16*4 + str r1, [sp, #4*16+1*4] + mov r12, sp + ldmia r0, {r4-r7} + ldmia lr!, {r8-r11} + stmia r1!, {r4-r7} + stmia r3!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r0!, {r4-r7} + stmia r12!, {r4-r7} + ldmia r0, {r4-r7} + ldmia lr!, {r8-r11} + stmia r1!, {r4-r7} + stmia r3!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r0!, {r4-r7} + stmia r12!, {r4-r7} + ldmia r0, {r4-r7} + ldmia lr!, {r8-r11} + stmia r1!, {r4-r7} + stmia r3!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r0!, {r4-r7} + stmia r12!, {r4-r7} + ldr r2, [sp, #4*16+3*4] + ldmia r0, {r4-r7} + ldmia lr!, {r8-r11} + stmia r1!, {r4-r7} + stmia r3!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r0!, {r4-r7} + stmia r12!, {r4-r7} + sub r1, r1, #4*16 + + add r1, r1, r2, lsl #7 + vstmia r1, {q0-q7} + add r3, r1, r2, lsl #7 + vstmia r3, {q8-q15} + + add lr, sp, #128 + veor.u32 q0, q0, q4 + veor.u32 q1, q1, q5 + veor.u32 q2, q2, q6 + veor.u32 q3, q3, q7 + vstmia lr, {q0-q3} + veor.u32 q8, q8, q12 + veor.u32 q9, q9, q13 + veor.u32 q10, q10, q14 + veor.u32 q11, q11, q15 + add r12, sp, #256 + vstmia r12, {q8-q11} + + ldmia sp, {r0-r12, lr} + ldrd r10, [sp, #14*4] + ldrd r6, [sp, #6*4] + vadd.u32 q4, q0, q1 + add r6, r2, r6 + vadd.u32 q6, q8, q9 + add r7, r3, r7 + vshl.u32 q5, q4, #7 + eor r10, r10, r6, ror #25 + vshl.u32 q7, q6, #7 + add r6, r0, r4 + vshr.u32 q4, q4, #32-7 + eor r11, r11, r7, ror #25 + vshr.u32 q6, q6, #32-7 + add r7, r1, r5 + veor.u32 q3, q3, q5 + strd r10, [sp, #14*4] + veor.u32 q11, q11, q7 + eor r12, r12, r6, ror #25 + veor.u32 q3, q3, q4 + eor lr, lr, r7, ror #25 + veor.u32 q11, q11, q6 + + ldrd r6, [sp, #10*4] + vadd.u32 q4, q3, q0 + add r2, r10, r2 + vadd.u32 q6, q11, q8 + add r3, r11, r3 + vshl.u32 q5, q4, #9 + eor r6, r6, r2, ror #23 + vshl.u32 q7, q6, #9 + add r2, r12, r0 + vshr.u32 q4, q4, #32-9 + eor r7, r7, r3, ror #23 + vshr.u32 q6, q6, #32-9 + add r3, lr, r1 + veor.u32 q2, q2, q5 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q7 + eor r8, r8, r2, ror #23 + veor.u32 q2, q2, q4 + eor r9, r9, r3, ror #23 + veor.u32 q10, q10, q6 + + ldrd r2, [sp, #6*4] + vadd.u32 q4, q2, q3 + add r10, r6, r10 + vadd.u32 q6, q10, q11 + add r11, r7, r11 + vext.u32 q3, q3, q3, #3 + eor r2, r2, r10, ror #19 + vshl.u32 q5, q4, #13 + add r10, r8, r12 + vext.u32 q11, q11, q11, #3 + eor r3, r3, r11, ror #19 + vshl.u32 q7, q6, #13 + add r11, r9, lr + vshr.u32 q4, q4, #32-13 + eor r4, r4, r10, ror #19 + vshr.u32 q6, q6, #32-13 + eor r5, r5, r11, ror #19 + veor.u32 q1, q1, q5 + veor.u32 q9, q9, q7 + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + ldrd r10, [sp, #2*4] + vadd.u32 q4, q1, q2 + add r6, r2, r6 + vadd.u32 q6, q9, q10 + add r7, r3, r7 + vswp.u32 d4, d5 + eor r10, r10, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, r4, r8 + vswp.u32 d20, d21 + eor r11, r11, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r5, r9 + vshr.u32 q4, q4, #32-18 + eor r0, r0, r6, ror #14 + vshr.u32 q6, q6, #32-18 + eor r1, r1, r7, ror #14 + veor.u32 q0, q0, q5 + ldrd r6, [sp, #14*4] + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + + + strd r2, [sp, #6*4] + vadd.u32 q4, q0, q3 + strd r10, [sp, #2*4] + vadd.u32 q6, q8, q11 + 
add r6, r11, r6 + vext.u32 q1, q1, q1, #1 + add r7, r0, r7 + vshl.u32 q5, q4, #7 + eor r4, r4, r6, ror #25 + vext.u32 q9, q9, q9, #1 + add r6, r1, r12 + vshl.u32 q7, q6, #7 + eor r5, r5, r7, ror #25 + vshr.u32 q4, q4, #32-7 + add r7, r10, lr + vshr.u32 q6, q6, #32-7 + eor r2, r2, r6, ror #25 + veor.u32 q1, q1, q5 + eor r3, r3, r7, ror #25 + veor.u32 q9, q9, q7 + strd r2, [sp, #6*4] + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + add r10, r3, r10 + vadd.u32 q4, q1, q0 + ldrd r6, [sp, #10*4] + vadd.u32 q6, q9, q8 + add r11, r4, r11 + vshl.u32 q5, q4, #9 + eor r8, r8, r10, ror #23 + vshl.u32 q7, q6, #9 + add r10, r5, r0 + vshr.u32 q4, q4, #32-9 + eor r9, r9, r11, ror #23 + vshr.u32 q6, q6, #32-9 + add r11, r2, r1 + veor.u32 q2, q2, q5 + eor r6, r6, r10, ror #23 + veor.u32 q10, q10, q7 + eor r7, r7, r11, ror #23 + veor.u32 q2, q2, q4 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q6 + + add r2, r7, r2 + vadd.u32 q4, q2, q1 + ldrd r10, [sp, #14*4] + vadd.u32 q6, q10, q9 + add r3, r8, r3 + vext.u32 q1, q1, q1, #3 + eor r12, r12, r2, ror #19 + vshl.u32 q5, q4, #13 + add r2, r9, r4 + vext.u32 q9, q9, q9, #3 + eor lr, lr, r3, ror #19 + vshl.u32 q7, q6, #13 + add r3, r6, r5 + vshr.u32 q4, q4, #32-13 + eor r10, r10, r2, ror #19 + vshr.u32 q6, q6, #32-13 + eor r11, r11, r3, ror #19 + veor.u32 q3, q3, q5 + veor.u32 q11, q11, q7 + veor.u32 q3, q3, q4 + veor.u32 q11, q11, q6 + + ldrd r2, [sp, #2*4] + vadd.u32 q4, q3, q2 + add r6, r11, r6 + vadd.u32 q6, q11, q10 + add r7, r12, r7 + vswp.u32 d4, d5 + eor r0, r0, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, lr, r8 + vswp.u32 d20, d21 + eor r1, r1, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r10, r9 + vext.u32 q3, q3, q3, #1 + eor r2, r2, r6, ror #14 + vshr.u32 q4, q4, #32-18 + eor r3, r3, r7, ror #14 + vshr.u32 q6, q6, #32-18 + strd r2, [sp, #2*4] + vext.u32 q11, q11, q11, #1 + strd r10, [sp, #14*4] + veor.u32 q0, q0, q5 + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + ldrd r6, [sp, #6*4] + vadd.u32 q4, q0, q1 + add r6, r2, r6 + vadd.u32 q6, q8, q9 + add r7, r3, r7 + vshl.u32 q5, q4, #7 + eor r10, r10, r6, ror #25 + vshl.u32 q7, q6, #7 + add r6, r0, r4 + vshr.u32 q4, q4, #32-7 + eor r11, r11, r7, ror #25 + vshr.u32 q6, q6, #32-7 + add r7, r1, r5 + veor.u32 q3, q3, q5 + strd r10, [sp, #14*4] + veor.u32 q11, q11, q7 + eor r12, r12, r6, ror #25 + veor.u32 q3, q3, q4 + eor lr, lr, r7, ror #25 + veor.u32 q11, q11, q6 + + ldrd r6, [sp, #10*4] + vadd.u32 q4, q3, q0 + add r2, r10, r2 + vadd.u32 q6, q11, q8 + add r3, r11, r3 + vshl.u32 q5, q4, #9 + eor r6, r6, r2, ror #23 + vshl.u32 q7, q6, #9 + add r2, r12, r0 + vshr.u32 q4, q4, #32-9 + eor r7, r7, r3, ror #23 + vshr.u32 q6, q6, #32-9 + add r3, lr, r1 + veor.u32 q2, q2, q5 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q7 + eor r8, r8, r2, ror #23 + veor.u32 q2, q2, q4 + eor r9, r9, r3, ror #23 + veor.u32 q10, q10, q6 + + ldrd r2, [sp, #6*4] + vadd.u32 q4, q2, q3 + add r10, r6, r10 + vadd.u32 q6, q10, q11 + add r11, r7, r11 + vext.u32 q3, q3, q3, #3 + eor r2, r2, r10, ror #19 + vshl.u32 q5, q4, #13 + add r10, r8, r12 + vext.u32 q11, q11, q11, #3 + eor r3, r3, r11, ror #19 + vshl.u32 q7, q6, #13 + add r11, r9, lr + vshr.u32 q4, q4, #32-13 + eor r4, r4, r10, ror #19 + vshr.u32 q6, q6, #32-13 + eor r5, r5, r11, ror #19 + veor.u32 q1, q1, q5 + veor.u32 q9, q9, q7 + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + ldrd r10, [sp, #2*4] + vadd.u32 q4, q1, q2 + add r6, r2, r6 + vadd.u32 q6, q9, q10 + add r7, r3, r7 + vswp.u32 d4, d5 + eor r10, r10, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, r4, r8 + vswp.u32 
d20, d21 + eor r11, r11, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r5, r9 + vshr.u32 q4, q4, #32-18 + eor r0, r0, r6, ror #14 + vshr.u32 q6, q6, #32-18 + eor r1, r1, r7, ror #14 + veor.u32 q0, q0, q5 + ldrd r6, [sp, #14*4] + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + + + strd r2, [sp, #6*4] + vadd.u32 q4, q0, q3 + strd r10, [sp, #2*4] + vadd.u32 q6, q8, q11 + add r6, r11, r6 + vext.u32 q1, q1, q1, #1 + add r7, r0, r7 + vshl.u32 q5, q4, #7 + eor r4, r4, r6, ror #25 + vext.u32 q9, q9, q9, #1 + add r6, r1, r12 + vshl.u32 q7, q6, #7 + eor r5, r5, r7, ror #25 + vshr.u32 q4, q4, #32-7 + add r7, r10, lr + vshr.u32 q6, q6, #32-7 + eor r2, r2, r6, ror #25 + veor.u32 q1, q1, q5 + eor r3, r3, r7, ror #25 + veor.u32 q9, q9, q7 + strd r2, [sp, #6*4] + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + add r10, r3, r10 + vadd.u32 q4, q1, q0 + ldrd r6, [sp, #10*4] + vadd.u32 q6, q9, q8 + add r11, r4, r11 + vshl.u32 q5, q4, #9 + eor r8, r8, r10, ror #23 + vshl.u32 q7, q6, #9 + add r10, r5, r0 + vshr.u32 q4, q4, #32-9 + eor r9, r9, r11, ror #23 + vshr.u32 q6, q6, #32-9 + add r11, r2, r1 + veor.u32 q2, q2, q5 + eor r6, r6, r10, ror #23 + veor.u32 q10, q10, q7 + eor r7, r7, r11, ror #23 + veor.u32 q2, q2, q4 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q6 + + add r2, r7, r2 + vadd.u32 q4, q2, q1 + ldrd r10, [sp, #14*4] + vadd.u32 q6, q10, q9 + add r3, r8, r3 + vext.u32 q1, q1, q1, #3 + eor r12, r12, r2, ror #19 + vshl.u32 q5, q4, #13 + add r2, r9, r4 + vext.u32 q9, q9, q9, #3 + eor lr, lr, r3, ror #19 + vshl.u32 q7, q6, #13 + add r3, r6, r5 + vshr.u32 q4, q4, #32-13 + eor r10, r10, r2, ror #19 + vshr.u32 q6, q6, #32-13 + eor r11, r11, r3, ror #19 + veor.u32 q3, q3, q5 + veor.u32 q11, q11, q7 + veor.u32 q3, q3, q4 + veor.u32 q11, q11, q6 + + ldrd r2, [sp, #2*4] + vadd.u32 q4, q3, q2 + add r6, r11, r6 + vadd.u32 q6, q11, q10 + add r7, r12, r7 + vswp.u32 d4, d5 + eor r0, r0, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, lr, r8 + vswp.u32 d20, d21 + eor r1, r1, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r10, r9 + vext.u32 q3, q3, q3, #1 + eor r2, r2, r6, ror #14 + vshr.u32 q4, q4, #32-18 + eor r3, r3, r7, ror #14 + vshr.u32 q6, q6, #32-18 + strd r2, [sp, #2*4] + vext.u32 q11, q11, q11, #1 + strd r10, [sp, #14*4] + veor.u32 q0, q0, q5 + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + ldrd r6, [sp, #6*4] + vadd.u32 q4, q0, q1 + add r6, r2, r6 + vadd.u32 q6, q8, q9 + add r7, r3, r7 + vshl.u32 q5, q4, #7 + eor r10, r10, r6, ror #25 + vshl.u32 q7, q6, #7 + add r6, r0, r4 + vshr.u32 q4, q4, #32-7 + eor r11, r11, r7, ror #25 + vshr.u32 q6, q6, #32-7 + add r7, r1, r5 + veor.u32 q3, q3, q5 + strd r10, [sp, #14*4] + veor.u32 q11, q11, q7 + eor r12, r12, r6, ror #25 + veor.u32 q3, q3, q4 + eor lr, lr, r7, ror #25 + veor.u32 q11, q11, q6 + + ldrd r6, [sp, #10*4] + vadd.u32 q4, q3, q0 + add r2, r10, r2 + vadd.u32 q6, q11, q8 + add r3, r11, r3 + vshl.u32 q5, q4, #9 + eor r6, r6, r2, ror #23 + vshl.u32 q7, q6, #9 + add r2, r12, r0 + vshr.u32 q4, q4, #32-9 + eor r7, r7, r3, ror #23 + vshr.u32 q6, q6, #32-9 + add r3, lr, r1 + veor.u32 q2, q2, q5 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q7 + eor r8, r8, r2, ror #23 + veor.u32 q2, q2, q4 + eor r9, r9, r3, ror #23 + veor.u32 q10, q10, q6 + + ldrd r2, [sp, #6*4] + vadd.u32 q4, q2, q3 + add r10, r6, r10 + vadd.u32 q6, q10, q11 + add r11, r7, r11 + vext.u32 q3, q3, q3, #3 + eor r2, r2, r10, ror #19 + vshl.u32 q5, q4, #13 + add r10, r8, r12 + vext.u32 q11, q11, q11, #3 + eor r3, r3, r11, ror #19 + vshl.u32 q7, q6, #13 + add r11, r9, lr + 
vshr.u32 q4, q4, #32-13 + eor r4, r4, r10, ror #19 + vshr.u32 q6, q6, #32-13 + eor r5, r5, r11, ror #19 + veor.u32 q1, q1, q5 + veor.u32 q9, q9, q7 + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + ldrd r10, [sp, #2*4] + vadd.u32 q4, q1, q2 + add r6, r2, r6 + vadd.u32 q6, q9, q10 + add r7, r3, r7 + vswp.u32 d4, d5 + eor r10, r10, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, r4, r8 + vswp.u32 d20, d21 + eor r11, r11, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r5, r9 + vshr.u32 q4, q4, #32-18 + eor r0, r0, r6, ror #14 + vshr.u32 q6, q6, #32-18 + eor r1, r1, r7, ror #14 + veor.u32 q0, q0, q5 + ldrd r6, [sp, #14*4] + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + + + strd r2, [sp, #6*4] + vadd.u32 q4, q0, q3 + strd r10, [sp, #2*4] + vadd.u32 q6, q8, q11 + add r6, r11, r6 + vext.u32 q1, q1, q1, #1 + add r7, r0, r7 + vshl.u32 q5, q4, #7 + eor r4, r4, r6, ror #25 + vext.u32 q9, q9, q9, #1 + add r6, r1, r12 + vshl.u32 q7, q6, #7 + eor r5, r5, r7, ror #25 + vshr.u32 q4, q4, #32-7 + add r7, r10, lr + vshr.u32 q6, q6, #32-7 + eor r2, r2, r6, ror #25 + veor.u32 q1, q1, q5 + eor r3, r3, r7, ror #25 + veor.u32 q9, q9, q7 + strd r2, [sp, #6*4] + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + add r10, r3, r10 + vadd.u32 q4, q1, q0 + ldrd r6, [sp, #10*4] + vadd.u32 q6, q9, q8 + add r11, r4, r11 + vshl.u32 q5, q4, #9 + eor r8, r8, r10, ror #23 + vshl.u32 q7, q6, #9 + add r10, r5, r0 + vshr.u32 q4, q4, #32-9 + eor r9, r9, r11, ror #23 + vshr.u32 q6, q6, #32-9 + add r11, r2, r1 + veor.u32 q2, q2, q5 + eor r6, r6, r10, ror #23 + veor.u32 q10, q10, q7 + eor r7, r7, r11, ror #23 + veor.u32 q2, q2, q4 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q6 + + add r2, r7, r2 + vadd.u32 q4, q2, q1 + ldrd r10, [sp, #14*4] + vadd.u32 q6, q10, q9 + add r3, r8, r3 + vext.u32 q1, q1, q1, #3 + eor r12, r12, r2, ror #19 + vshl.u32 q5, q4, #13 + add r2, r9, r4 + vext.u32 q9, q9, q9, #3 + eor lr, lr, r3, ror #19 + vshl.u32 q7, q6, #13 + add r3, r6, r5 + vshr.u32 q4, q4, #32-13 + eor r10, r10, r2, ror #19 + vshr.u32 q6, q6, #32-13 + eor r11, r11, r3, ror #19 + veor.u32 q3, q3, q5 + veor.u32 q11, q11, q7 + veor.u32 q3, q3, q4 + veor.u32 q11, q11, q6 + + ldrd r2, [sp, #2*4] + vadd.u32 q4, q3, q2 + add r6, r11, r6 + vadd.u32 q6, q11, q10 + add r7, r12, r7 + vswp.u32 d4, d5 + eor r0, r0, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, lr, r8 + vswp.u32 d20, d21 + eor r1, r1, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r10, r9 + vext.u32 q3, q3, q3, #1 + eor r2, r2, r6, ror #14 + vshr.u32 q4, q4, #32-18 + eor r3, r3, r7, ror #14 + vshr.u32 q6, q6, #32-18 + strd r2, [sp, #2*4] + vext.u32 q11, q11, q11, #1 + strd r10, [sp, #14*4] + veor.u32 q0, q0, q5 + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + ldrd r6, [sp, #6*4] + vadd.u32 q4, q0, q1 + add r6, r2, r6 + vadd.u32 q6, q8, q9 + add r7, r3, r7 + vshl.u32 q5, q4, #7 + eor r10, r10, r6, ror #25 + vshl.u32 q7, q6, #7 + add r6, r0, r4 + vshr.u32 q4, q4, #32-7 + eor r11, r11, r7, ror #25 + vshr.u32 q6, q6, #32-7 + add r7, r1, r5 + veor.u32 q3, q3, q5 + strd r10, [sp, #14*4] + veor.u32 q11, q11, q7 + eor r12, r12, r6, ror #25 + veor.u32 q3, q3, q4 + eor lr, lr, r7, ror #25 + veor.u32 q11, q11, q6 + + ldrd r6, [sp, #10*4] + vadd.u32 q4, q3, q0 + add r2, r10, r2 + vadd.u32 q6, q11, q8 + add r3, r11, r3 + vshl.u32 q5, q4, #9 + eor r6, r6, r2, ror #23 + vshl.u32 q7, q6, #9 + add r2, r12, r0 + vshr.u32 q4, q4, #32-9 + eor r7, r7, r3, ror #23 + vshr.u32 q6, q6, #32-9 + add r3, lr, r1 + veor.u32 q2, q2, q5 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q7 + eor r8, r8, 
r2, ror #23 + veor.u32 q2, q2, q4 + eor r9, r9, r3, ror #23 + veor.u32 q10, q10, q6 + + ldrd r2, [sp, #6*4] + vadd.u32 q4, q2, q3 + add r10, r6, r10 + vadd.u32 q6, q10, q11 + add r11, r7, r11 + vext.u32 q3, q3, q3, #3 + eor r2, r2, r10, ror #19 + vshl.u32 q5, q4, #13 + add r10, r8, r12 + vext.u32 q11, q11, q11, #3 + eor r3, r3, r11, ror #19 + vshl.u32 q7, q6, #13 + add r11, r9, lr + vshr.u32 q4, q4, #32-13 + eor r4, r4, r10, ror #19 + vshr.u32 q6, q6, #32-13 + eor r5, r5, r11, ror #19 + veor.u32 q1, q1, q5 + veor.u32 q9, q9, q7 + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + ldrd r10, [sp, #2*4] + vadd.u32 q4, q1, q2 + add r6, r2, r6 + vadd.u32 q6, q9, q10 + add r7, r3, r7 + vswp.u32 d4, d5 + eor r10, r10, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, r4, r8 + vswp.u32 d20, d21 + eor r11, r11, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r5, r9 + vshr.u32 q4, q4, #32-18 + eor r0, r0, r6, ror #14 + vshr.u32 q6, q6, #32-18 + eor r1, r1, r7, ror #14 + veor.u32 q0, q0, q5 + ldrd r6, [sp, #14*4] + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + + + strd r2, [sp, #6*4] + vadd.u32 q4, q0, q3 + strd r10, [sp, #2*4] + vadd.u32 q6, q8, q11 + add r6, r11, r6 + vext.u32 q1, q1, q1, #1 + add r7, r0, r7 + vshl.u32 q5, q4, #7 + eor r4, r4, r6, ror #25 + vext.u32 q9, q9, q9, #1 + add r6, r1, r12 + vshl.u32 q7, q6, #7 + eor r5, r5, r7, ror #25 + vshr.u32 q4, q4, #32-7 + add r7, r10, lr + vshr.u32 q6, q6, #32-7 + eor r2, r2, r6, ror #25 + veor.u32 q1, q1, q5 + eor r3, r3, r7, ror #25 + veor.u32 q9, q9, q7 + strd r2, [sp, #6*4] + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + add r10, r3, r10 + vadd.u32 q4, q1, q0 + ldrd r6, [sp, #10*4] + vadd.u32 q6, q9, q8 + add r11, r4, r11 + vshl.u32 q5, q4, #9 + eor r8, r8, r10, ror #23 + vshl.u32 q7, q6, #9 + add r10, r5, r0 + vshr.u32 q4, q4, #32-9 + eor r9, r9, r11, ror #23 + vshr.u32 q6, q6, #32-9 + add r11, r2, r1 + veor.u32 q2, q2, q5 + eor r6, r6, r10, ror #23 + veor.u32 q10, q10, q7 + eor r7, r7, r11, ror #23 + veor.u32 q2, q2, q4 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q6 + + add r2, r7, r2 + vadd.u32 q4, q2, q1 + ldrd r10, [sp, #14*4] + vadd.u32 q6, q10, q9 + add r3, r8, r3 + vext.u32 q1, q1, q1, #3 + eor r12, r12, r2, ror #19 + vshl.u32 q5, q4, #13 + add r2, r9, r4 + vext.u32 q9, q9, q9, #3 + eor lr, lr, r3, ror #19 + vshl.u32 q7, q6, #13 + add r3, r6, r5 + vshr.u32 q4, q4, #32-13 + eor r10, r10, r2, ror #19 + vshr.u32 q6, q6, #32-13 + eor r11, r11, r3, ror #19 + veor.u32 q3, q3, q5 + veor.u32 q11, q11, q7 + veor.u32 q3, q3, q4 + veor.u32 q11, q11, q6 + + ldrd r2, [sp, #2*4] + vadd.u32 q4, q3, q2 + add r6, r11, r6 + vadd.u32 q6, q11, q10 + add r7, r12, r7 + vswp.u32 d4, d5 + eor r0, r0, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, lr, r8 + vswp.u32 d20, d21 + eor r1, r1, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r10, r9 + vext.u32 q3, q3, q3, #1 + eor r2, r2, r6, ror #14 + vshr.u32 q4, q4, #32-18 + eor r3, r3, r7, ror #14 + vshr.u32 q6, q6, #32-18 + strd r2, [sp, #2*4] + vext.u32 q11, q11, q11, #1 + strd r10, [sp, #14*4] + veor.u32 q0, q0, q5 + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + stmia sp, {r0-r5} + strd r8, [sp, #8*4] + str r12, [sp, #12*4] + str lr, [sp, #13*4] + + ldr r0, [sp, #4*16+0*4] + mov r12, sp + add r2, r0, #16*4 + ldmia r12, {r4-r7} + ldmia r0, {r8-r11} + add r4, r4, r8 + add r5, r5, r9 + add r6, r6, r10 + add r7, r7, r11 + stmia r0!, {r4-r7} + ldmia r2, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r2!, {r4-r7} + stmia r12!, {r4-r7} + ldmia r12, {r4-r7} 
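+ @ note: the repeated load/add/store then load/eor/store groups in this stretch
+ @ are the inlined form of the scrypt_core_macro2_x4 step (the macro itself is
+ @ removed further down in this patch): the salsa8 words kept on the stack are
+ @ added back into the current 64-byte block, and the sums are then XORed into
+ @ the adjacent block before the next salsa8 pass.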
+ ldmia r0, {r8-r11} + add r4, r4, r8 + add r5, r5, r9 + add r6, r6, r10 + add r7, r7, r11 + stmia r0!, {r4-r7} + ldmia r2, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r2!, {r4-r7} + stmia r12!, {r4-r7} + ldmia r12, {r4-r7} + ldmia r0, {r8-r11} + add r4, r4, r8 + add r5, r5, r9 + add r6, r6, r10 + add r7, r7, r11 + stmia r0!, {r4-r7} + ldmia r2, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r2!, {r4-r7} + stmia r12!, {r4-r7} + ldmia r12, {r4-r7} + ldmia r0, {r8-r11} + add r4, r4, r8 + add r5, r5, r9 + add r6, r6, r10 + add r7, r7, r11 + stmia r0!, {r4-r7} + ldmia r2, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r2!, {r4-r7} + stmia r12!, {r4-r7} + + add lr, sp, #128 + vldmia lr, {q4-q7} + vadd.u32 q4, q4, q0 + vadd.u32 q5, q5, q1 + vadd.u32 q6, q6, q2 + vadd.u32 q7, q7, q3 + add r12, sp, #256 + vldmia r12, {q0-q3} + vstmia lr, {q4-q7} + vadd.u32 q8, q8, q0 + vadd.u32 q9, q9, q1 + vadd.u32 q10, q10, q2 + vadd.u32 q11, q11, q3 + + add r4, sp, #128+4*16 + vldmia r4, {q0-q3} + vstmia r12, {q8-q11} + veor.u32 q0, q0, q4 + veor.u32 q1, q1, q5 + veor.u32 q2, q2, q6 + veor.u32 q3, q3, q7 + vstmia r4, {q0-q3} + veor.u32 q8, q8, q12 + veor.u32 q9, q9, q13 + veor.u32 q10, q10, q14 + veor.u32 q11, q11, q15 + vmov q12, q8 + vmov q13, q9 + vmov q14, q10 + vmov q15, q11 + + ldmia sp, {r0-r12, lr} + ldrd r10, [sp, #14*4] + ldrd r6, [sp, #6*4] + vadd.u32 q4, q0, q1 + add r6, r2, r6 + vadd.u32 q6, q8, q9 + add r7, r3, r7 + vshl.u32 q5, q4, #7 + eor r10, r10, r6, ror #25 + vshl.u32 q7, q6, #7 + add r6, r0, r4 + vshr.u32 q4, q4, #32-7 + eor r11, r11, r7, ror #25 + vshr.u32 q6, q6, #32-7 + add r7, r1, r5 + veor.u32 q3, q3, q5 + strd r10, [sp, #14*4] + veor.u32 q11, q11, q7 + eor r12, r12, r6, ror #25 + veor.u32 q3, q3, q4 + eor lr, lr, r7, ror #25 + veor.u32 q11, q11, q6 + + ldrd r6, [sp, #10*4] + vadd.u32 q4, q3, q0 + add r2, r10, r2 + vadd.u32 q6, q11, q8 + add r3, r11, r3 + vshl.u32 q5, q4, #9 + eor r6, r6, r2, ror #23 + vshl.u32 q7, q6, #9 + add r2, r12, r0 + vshr.u32 q4, q4, #32-9 + eor r7, r7, r3, ror #23 + vshr.u32 q6, q6, #32-9 + add r3, lr, r1 + veor.u32 q2, q2, q5 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q7 + eor r8, r8, r2, ror #23 + veor.u32 q2, q2, q4 + eor r9, r9, r3, ror #23 + veor.u32 q10, q10, q6 + + ldrd r2, [sp, #6*4] + vadd.u32 q4, q2, q3 + add r10, r6, r10 + vadd.u32 q6, q10, q11 + add r11, r7, r11 + vext.u32 q3, q3, q3, #3 + eor r2, r2, r10, ror #19 + vshl.u32 q5, q4, #13 + add r10, r8, r12 + vext.u32 q11, q11, q11, #3 + eor r3, r3, r11, ror #19 + vshl.u32 q7, q6, #13 + add r11, r9, lr + vshr.u32 q4, q4, #32-13 + eor r4, r4, r10, ror #19 + vshr.u32 q6, q6, #32-13 + eor r5, r5, r11, ror #19 + veor.u32 q1, q1, q5 + veor.u32 q9, q9, q7 + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + ldrd r10, [sp, #2*4] + vadd.u32 q4, q1, q2 + add r6, r2, r6 + vadd.u32 q6, q9, q10 + add r7, r3, r7 + vswp.u32 d4, d5 + eor r10, r10, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, r4, r8 + vswp.u32 d20, d21 + eor r11, r11, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r5, r9 + vshr.u32 q4, q4, #32-18 + eor r0, r0, r6, ror #14 + vshr.u32 q6, q6, #32-18 + eor r1, r1, r7, ror #14 + veor.u32 q0, q0, q5 + ldrd r6, [sp, #14*4] + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + + + strd r2, [sp, #6*4] + vadd.u32 q4, q0, q3 + strd r10, [sp, #2*4] + vadd.u32 q6, q8, q11 + add r6, r11, r6 + vext.u32 q1, q1, q1, #1 + add r7, r0, r7 + vshl.u32 q5, q4, #7 + eor r4, r4, r6, ror #25 + 
vext.u32 q9, q9, q9, #1 + add r6, r1, r12 + vshl.u32 q7, q6, #7 + eor r5, r5, r7, ror #25 + vshr.u32 q4, q4, #32-7 + add r7, r10, lr + vshr.u32 q6, q6, #32-7 + eor r2, r2, r6, ror #25 + veor.u32 q1, q1, q5 + eor r3, r3, r7, ror #25 + veor.u32 q9, q9, q7 + strd r2, [sp, #6*4] + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + add r10, r3, r10 + vadd.u32 q4, q1, q0 + ldrd r6, [sp, #10*4] + vadd.u32 q6, q9, q8 + add r11, r4, r11 + vshl.u32 q5, q4, #9 + eor r8, r8, r10, ror #23 + vshl.u32 q7, q6, #9 + add r10, r5, r0 + vshr.u32 q4, q4, #32-9 + eor r9, r9, r11, ror #23 + vshr.u32 q6, q6, #32-9 + add r11, r2, r1 + veor.u32 q2, q2, q5 + eor r6, r6, r10, ror #23 + veor.u32 q10, q10, q7 + eor r7, r7, r11, ror #23 + veor.u32 q2, q2, q4 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q6 + + add r2, r7, r2 + vadd.u32 q4, q2, q1 + ldrd r10, [sp, #14*4] + vadd.u32 q6, q10, q9 + add r3, r8, r3 + vext.u32 q1, q1, q1, #3 + eor r12, r12, r2, ror #19 + vshl.u32 q5, q4, #13 + add r2, r9, r4 + vext.u32 q9, q9, q9, #3 + eor lr, lr, r3, ror #19 + vshl.u32 q7, q6, #13 + add r3, r6, r5 + vshr.u32 q4, q4, #32-13 + eor r10, r10, r2, ror #19 + vshr.u32 q6, q6, #32-13 + eor r11, r11, r3, ror #19 + veor.u32 q3, q3, q5 + veor.u32 q11, q11, q7 + veor.u32 q3, q3, q4 + veor.u32 q11, q11, q6 + + ldrd r2, [sp, #2*4] + vadd.u32 q4, q3, q2 + add r6, r11, r6 + vadd.u32 q6, q11, q10 + add r7, r12, r7 + vswp.u32 d4, d5 + eor r0, r0, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, lr, r8 + vswp.u32 d20, d21 + eor r1, r1, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r10, r9 + vext.u32 q3, q3, q3, #1 + eor r2, r2, r6, ror #14 + vshr.u32 q4, q4, #32-18 + eor r3, r3, r7, ror #14 + vshr.u32 q6, q6, #32-18 + strd r2, [sp, #2*4] + vext.u32 q11, q11, q11, #1 + strd r10, [sp, #14*4] + veor.u32 q0, q0, q5 + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + ldrd r6, [sp, #6*4] + vadd.u32 q4, q0, q1 + add r6, r2, r6 + vadd.u32 q6, q8, q9 + add r7, r3, r7 + vshl.u32 q5, q4, #7 + eor r10, r10, r6, ror #25 + vshl.u32 q7, q6, #7 + add r6, r0, r4 + vshr.u32 q4, q4, #32-7 + eor r11, r11, r7, ror #25 + vshr.u32 q6, q6, #32-7 + add r7, r1, r5 + veor.u32 q3, q3, q5 + strd r10, [sp, #14*4] + veor.u32 q11, q11, q7 + eor r12, r12, r6, ror #25 + veor.u32 q3, q3, q4 + eor lr, lr, r7, ror #25 + veor.u32 q11, q11, q6 + + ldrd r6, [sp, #10*4] + vadd.u32 q4, q3, q0 + add r2, r10, r2 + vadd.u32 q6, q11, q8 + add r3, r11, r3 + vshl.u32 q5, q4, #9 + eor r6, r6, r2, ror #23 + vshl.u32 q7, q6, #9 + add r2, r12, r0 + vshr.u32 q4, q4, #32-9 + eor r7, r7, r3, ror #23 + vshr.u32 q6, q6, #32-9 + add r3, lr, r1 + veor.u32 q2, q2, q5 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q7 + eor r8, r8, r2, ror #23 + veor.u32 q2, q2, q4 + eor r9, r9, r3, ror #23 + veor.u32 q10, q10, q6 + + ldrd r2, [sp, #6*4] + vadd.u32 q4, q2, q3 + add r10, r6, r10 + vadd.u32 q6, q10, q11 + add r11, r7, r11 + vext.u32 q3, q3, q3, #3 + eor r2, r2, r10, ror #19 + vshl.u32 q5, q4, #13 + add r10, r8, r12 + vext.u32 q11, q11, q11, #3 + eor r3, r3, r11, ror #19 + vshl.u32 q7, q6, #13 + add r11, r9, lr + vshr.u32 q4, q4, #32-13 + eor r4, r4, r10, ror #19 + vshr.u32 q6, q6, #32-13 + eor r5, r5, r11, ror #19 + veor.u32 q1, q1, q5 + veor.u32 q9, q9, q7 + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + ldrd r10, [sp, #2*4] + vadd.u32 q4, q1, q2 + add r6, r2, r6 + vadd.u32 q6, q9, q10 + add r7, r3, r7 + vswp.u32 d4, d5 + eor r10, r10, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, r4, r8 + vswp.u32 d20, d21 + eor r11, r11, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r5, r9 + vshr.u32 q4, q4, #32-18 + eor 
r0, r0, r6, ror #14 + vshr.u32 q6, q6, #32-18 + eor r1, r1, r7, ror #14 + veor.u32 q0, q0, q5 + ldrd r6, [sp, #14*4] + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + + + strd r2, [sp, #6*4] + vadd.u32 q4, q0, q3 + strd r10, [sp, #2*4] + vadd.u32 q6, q8, q11 + add r6, r11, r6 + vext.u32 q1, q1, q1, #1 + add r7, r0, r7 + vshl.u32 q5, q4, #7 + eor r4, r4, r6, ror #25 + vext.u32 q9, q9, q9, #1 + add r6, r1, r12 + vshl.u32 q7, q6, #7 + eor r5, r5, r7, ror #25 + vshr.u32 q4, q4, #32-7 + add r7, r10, lr + vshr.u32 q6, q6, #32-7 + eor r2, r2, r6, ror #25 + veor.u32 q1, q1, q5 + eor r3, r3, r7, ror #25 + veor.u32 q9, q9, q7 + strd r2, [sp, #6*4] + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + add r10, r3, r10 + vadd.u32 q4, q1, q0 + ldrd r6, [sp, #10*4] + vadd.u32 q6, q9, q8 + add r11, r4, r11 + vshl.u32 q5, q4, #9 + eor r8, r8, r10, ror #23 + vshl.u32 q7, q6, #9 + add r10, r5, r0 + vshr.u32 q4, q4, #32-9 + eor r9, r9, r11, ror #23 + vshr.u32 q6, q6, #32-9 + add r11, r2, r1 + veor.u32 q2, q2, q5 + eor r6, r6, r10, ror #23 + veor.u32 q10, q10, q7 + eor r7, r7, r11, ror #23 + veor.u32 q2, q2, q4 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q6 + + add r2, r7, r2 + vadd.u32 q4, q2, q1 + ldrd r10, [sp, #14*4] + vadd.u32 q6, q10, q9 + add r3, r8, r3 + vext.u32 q1, q1, q1, #3 + eor r12, r12, r2, ror #19 + vshl.u32 q5, q4, #13 + add r2, r9, r4 + vext.u32 q9, q9, q9, #3 + eor lr, lr, r3, ror #19 + vshl.u32 q7, q6, #13 + add r3, r6, r5 + vshr.u32 q4, q4, #32-13 + eor r10, r10, r2, ror #19 + vshr.u32 q6, q6, #32-13 + eor r11, r11, r3, ror #19 + veor.u32 q3, q3, q5 + veor.u32 q11, q11, q7 + veor.u32 q3, q3, q4 + veor.u32 q11, q11, q6 + + ldrd r2, [sp, #2*4] + vadd.u32 q4, q3, q2 + add r6, r11, r6 + vadd.u32 q6, q11, q10 + add r7, r12, r7 + vswp.u32 d4, d5 + eor r0, r0, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, lr, r8 + vswp.u32 d20, d21 + eor r1, r1, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r10, r9 + vext.u32 q3, q3, q3, #1 + eor r2, r2, r6, ror #14 + vshr.u32 q4, q4, #32-18 + eor r3, r3, r7, ror #14 + vshr.u32 q6, q6, #32-18 + strd r2, [sp, #2*4] + vext.u32 q11, q11, q11, #1 + strd r10, [sp, #14*4] + veor.u32 q0, q0, q5 + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + ldrd r6, [sp, #6*4] + vadd.u32 q4, q0, q1 + add r6, r2, r6 + vadd.u32 q6, q8, q9 + add r7, r3, r7 + vshl.u32 q5, q4, #7 + eor r10, r10, r6, ror #25 + vshl.u32 q7, q6, #7 + add r6, r0, r4 + vshr.u32 q4, q4, #32-7 + eor r11, r11, r7, ror #25 + vshr.u32 q6, q6, #32-7 + add r7, r1, r5 + veor.u32 q3, q3, q5 + strd r10, [sp, #14*4] + veor.u32 q11, q11, q7 + eor r12, r12, r6, ror #25 + veor.u32 q3, q3, q4 + eor lr, lr, r7, ror #25 + veor.u32 q11, q11, q6 + + ldrd r6, [sp, #10*4] + vadd.u32 q4, q3, q0 + add r2, r10, r2 + vadd.u32 q6, q11, q8 + add r3, r11, r3 + vshl.u32 q5, q4, #9 + eor r6, r6, r2, ror #23 + vshl.u32 q7, q6, #9 + add r2, r12, r0 + vshr.u32 q4, q4, #32-9 + eor r7, r7, r3, ror #23 + vshr.u32 q6, q6, #32-9 + add r3, lr, r1 + veor.u32 q2, q2, q5 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q7 + eor r8, r8, r2, ror #23 + veor.u32 q2, q2, q4 + eor r9, r9, r3, ror #23 + veor.u32 q10, q10, q6 + + ldrd r2, [sp, #6*4] + vadd.u32 q4, q2, q3 + add r10, r6, r10 + vadd.u32 q6, q10, q11 + add r11, r7, r11 + vext.u32 q3, q3, q3, #3 + eor r2, r2, r10, ror #19 + vshl.u32 q5, q4, #13 + add r10, r8, r12 + vext.u32 q11, q11, q11, #3 + eor r3, r3, r11, ror #19 + vshl.u32 q7, q6, #13 + add r11, r9, lr + vshr.u32 q4, q4, #32-13 + eor r4, r4, r10, ror #19 + vshr.u32 q6, q6, #32-13 + eor r5, r5, r11, ror #19 + veor.u32 
q1, q1, q5 + veor.u32 q9, q9, q7 + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + ldrd r10, [sp, #2*4] + vadd.u32 q4, q1, q2 + add r6, r2, r6 + vadd.u32 q6, q9, q10 + add r7, r3, r7 + vswp.u32 d4, d5 + eor r10, r10, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, r4, r8 + vswp.u32 d20, d21 + eor r11, r11, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r5, r9 + vshr.u32 q4, q4, #32-18 + eor r0, r0, r6, ror #14 + vshr.u32 q6, q6, #32-18 + eor r1, r1, r7, ror #14 + veor.u32 q0, q0, q5 + ldrd r6, [sp, #14*4] + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + + + strd r2, [sp, #6*4] + vadd.u32 q4, q0, q3 + strd r10, [sp, #2*4] + vadd.u32 q6, q8, q11 + add r6, r11, r6 + vext.u32 q1, q1, q1, #1 + add r7, r0, r7 + vshl.u32 q5, q4, #7 + eor r4, r4, r6, ror #25 + vext.u32 q9, q9, q9, #1 + add r6, r1, r12 + vshl.u32 q7, q6, #7 + eor r5, r5, r7, ror #25 + vshr.u32 q4, q4, #32-7 + add r7, r10, lr + vshr.u32 q6, q6, #32-7 + eor r2, r2, r6, ror #25 + veor.u32 q1, q1, q5 + eor r3, r3, r7, ror #25 + veor.u32 q9, q9, q7 + strd r2, [sp, #6*4] + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + add r10, r3, r10 + vadd.u32 q4, q1, q0 + ldrd r6, [sp, #10*4] + vadd.u32 q6, q9, q8 + add r11, r4, r11 + vshl.u32 q5, q4, #9 + eor r8, r8, r10, ror #23 + vshl.u32 q7, q6, #9 + add r10, r5, r0 + vshr.u32 q4, q4, #32-9 + eor r9, r9, r11, ror #23 + vshr.u32 q6, q6, #32-9 + add r11, r2, r1 + veor.u32 q2, q2, q5 + eor r6, r6, r10, ror #23 + veor.u32 q10, q10, q7 + eor r7, r7, r11, ror #23 + veor.u32 q2, q2, q4 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q6 + + add r2, r7, r2 + vadd.u32 q4, q2, q1 + ldrd r10, [sp, #14*4] + vadd.u32 q6, q10, q9 + add r3, r8, r3 + vext.u32 q1, q1, q1, #3 + eor r12, r12, r2, ror #19 + vshl.u32 q5, q4, #13 + add r2, r9, r4 + vext.u32 q9, q9, q9, #3 + eor lr, lr, r3, ror #19 + vshl.u32 q7, q6, #13 + add r3, r6, r5 + vshr.u32 q4, q4, #32-13 + eor r10, r10, r2, ror #19 + vshr.u32 q6, q6, #32-13 + eor r11, r11, r3, ror #19 + veor.u32 q3, q3, q5 + veor.u32 q11, q11, q7 + veor.u32 q3, q3, q4 + veor.u32 q11, q11, q6 + + ldrd r2, [sp, #2*4] + vadd.u32 q4, q3, q2 + add r6, r11, r6 + vadd.u32 q6, q11, q10 + add r7, r12, r7 + vswp.u32 d4, d5 + eor r0, r0, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, lr, r8 + vswp.u32 d20, d21 + eor r1, r1, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r10, r9 + vext.u32 q3, q3, q3, #1 + eor r2, r2, r6, ror #14 + vshr.u32 q4, q4, #32-18 + eor r3, r3, r7, ror #14 + vshr.u32 q6, q6, #32-18 + strd r2, [sp, #2*4] + vext.u32 q11, q11, q11, #1 + strd r10, [sp, #14*4] + veor.u32 q0, q0, q5 + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + ldrd r6, [sp, #6*4] + vadd.u32 q4, q0, q1 + add r6, r2, r6 + vadd.u32 q6, q8, q9 + add r7, r3, r7 + vshl.u32 q5, q4, #7 + eor r10, r10, r6, ror #25 + vshl.u32 q7, q6, #7 + add r6, r0, r4 + vshr.u32 q4, q4, #32-7 + eor r11, r11, r7, ror #25 + vshr.u32 q6, q6, #32-7 + add r7, r1, r5 + veor.u32 q3, q3, q5 + strd r10, [sp, #14*4] + veor.u32 q11, q11, q7 + eor r12, r12, r6, ror #25 + veor.u32 q3, q3, q4 + eor lr, lr, r7, ror #25 + veor.u32 q11, q11, q6 + + ldrd r6, [sp, #10*4] + vadd.u32 q4, q3, q0 + add r2, r10, r2 + vadd.u32 q6, q11, q8 + add r3, r11, r3 + vshl.u32 q5, q4, #9 + eor r6, r6, r2, ror #23 + vshl.u32 q7, q6, #9 + add r2, r12, r0 + vshr.u32 q4, q4, #32-9 + eor r7, r7, r3, ror #23 + vshr.u32 q6, q6, #32-9 + add r3, lr, r1 + veor.u32 q2, q2, q5 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q7 + eor r8, r8, r2, ror #23 + veor.u32 q2, q2, q4 + eor r9, r9, r3, ror #23 + veor.u32 q10, q10, q6 + + ldrd r2, [sp, #6*4] + 
vadd.u32 q4, q2, q3 + add r10, r6, r10 + vadd.u32 q6, q10, q11 + add r11, r7, r11 + vext.u32 q3, q3, q3, #3 + eor r2, r2, r10, ror #19 + vshl.u32 q5, q4, #13 + add r10, r8, r12 + vext.u32 q11, q11, q11, #3 + eor r3, r3, r11, ror #19 + vshl.u32 q7, q6, #13 + add r11, r9, lr + vshr.u32 q4, q4, #32-13 + eor r4, r4, r10, ror #19 + vshr.u32 q6, q6, #32-13 + eor r5, r5, r11, ror #19 + veor.u32 q1, q1, q5 + veor.u32 q9, q9, q7 + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + ldrd r10, [sp, #2*4] + vadd.u32 q4, q1, q2 + add r6, r2, r6 + vadd.u32 q6, q9, q10 + add r7, r3, r7 + vswp.u32 d4, d5 + eor r10, r10, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, r4, r8 + vswp.u32 d20, d21 + eor r11, r11, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r5, r9 + vshr.u32 q4, q4, #32-18 + eor r0, r0, r6, ror #14 + vshr.u32 q6, q6, #32-18 + eor r1, r1, r7, ror #14 + veor.u32 q0, q0, q5 + ldrd r6, [sp, #14*4] + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + + + strd r2, [sp, #6*4] + vadd.u32 q4, q0, q3 + strd r10, [sp, #2*4] + vadd.u32 q6, q8, q11 + add r6, r11, r6 + vext.u32 q1, q1, q1, #1 + add r7, r0, r7 + vshl.u32 q5, q4, #7 + eor r4, r4, r6, ror #25 + vext.u32 q9, q9, q9, #1 + add r6, r1, r12 + vshl.u32 q7, q6, #7 + eor r5, r5, r7, ror #25 + vshr.u32 q4, q4, #32-7 + add r7, r10, lr + vshr.u32 q6, q6, #32-7 + eor r2, r2, r6, ror #25 + veor.u32 q1, q1, q5 + eor r3, r3, r7, ror #25 + veor.u32 q9, q9, q7 + strd r2, [sp, #6*4] + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + add r10, r3, r10 + vadd.u32 q4, q1, q0 + ldrd r6, [sp, #10*4] + vadd.u32 q6, q9, q8 + add r11, r4, r11 + vshl.u32 q5, q4, #9 + eor r8, r8, r10, ror #23 + vshl.u32 q7, q6, #9 + add r10, r5, r0 + vshr.u32 q4, q4, #32-9 + eor r9, r9, r11, ror #23 + vshr.u32 q6, q6, #32-9 + add r11, r2, r1 + veor.u32 q2, q2, q5 + eor r6, r6, r10, ror #23 + veor.u32 q10, q10, q7 + eor r7, r7, r11, ror #23 + veor.u32 q2, q2, q4 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q6 + + add r2, r7, r2 + vadd.u32 q4, q2, q1 + ldrd r10, [sp, #14*4] + vadd.u32 q6, q10, q9 + add r3, r8, r3 + vext.u32 q1, q1, q1, #3 + eor r12, r12, r2, ror #19 + vshl.u32 q5, q4, #13 + add r2, r9, r4 + vext.u32 q9, q9, q9, #3 + eor lr, lr, r3, ror #19 + vshl.u32 q7, q6, #13 + add r3, r6, r5 + vshr.u32 q4, q4, #32-13 + eor r10, r10, r2, ror #19 + vshr.u32 q6, q6, #32-13 + eor r11, r11, r3, ror #19 + veor.u32 q3, q3, q5 + veor.u32 q11, q11, q7 + veor.u32 q3, q3, q4 + veor.u32 q11, q11, q6 + + ldrd r2, [sp, #2*4] + vadd.u32 q4, q3, q2 + add r6, r11, r6 + vadd.u32 q6, q11, q10 + add r7, r12, r7 + vswp.u32 d4, d5 + eor r0, r0, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, lr, r8 + vswp.u32 d20, d21 + eor r1, r1, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r10, r9 + vext.u32 q3, q3, q3, #1 + eor r2, r2, r6, ror #14 + vshr.u32 q4, q4, #32-18 + eor r3, r3, r7, ror #14 + vshr.u32 q6, q6, #32-18 + strd r2, [sp, #2*4] + vext.u32 q11, q11, q11, #1 + strd r10, [sp, #14*4] + veor.u32 q0, q0, q5 + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + stmia sp, {r0-r5} + strd r8, [sp, #8*4] + str r12, [sp, #12*4] + str lr, [sp, #13*4] + + ldr r0, [sp, #4*16+0*4] + mov r1, sp + add r0, r0, #16*4 + ldmia r1!, {r2-r7} + ldmia r0, {r8-r12, lr} + add r2, r2, r8 + add r3, r3, r9 + add r4, r4, r10 + add r5, r5, r11 + add r6, r6, r12 + add r7, r7, lr + stmia r0!, {r2-r7} + ldmia r1!, {r2-r7} + ldmia r0, {r8-r12, lr} + add r2, r2, r8 + add r3, r3, r9 + add r4, r4, r10 + add r5, r5, r11 + add r6, r6, r12 + add r7, r7, lr + stmia r0!, {r2-r7} + ldmia r1!, {r4-r7} + ldmia r0, {r8-r11} + add r4, r4, 
r8 + add r5, r5, r9 + add r6, r6, r10 + add r7, r7, r11 + stmia r0!, {r4-r7} + sub r0, r0, #8*16 + + ldr r1, [sp, #4*16+1*4] + ldr r2, [sp, #4*16+2*4] + add lr, sp, #128 + add r4, sp, #128+4*16 + vldmia r4, {q4-q7} + vadd.u32 q4, q4, q0 + vadd.u32 q5, q5, q1 + vadd.u32 q6, q6, q2 + vadd.u32 q7, q7, q3 + vstmia r4, {q4-q7} + vldmia lr, {q0-q3} + vadd.u32 q12, q12, q8 + vadd.u32 q13, q13, q9 + vadd.u32 q14, q14, q10 + vadd.u32 q15, q15, q11 + add r12, sp, #256 + vldmia r12, {q8-q11} + + add r1, r1, #8*16 + cmp r1, r2 + bne scrypt_core_3way_loop1 + + ldr r2, [sp, #4*16+3*4] + add r5, sp, #256+4*16 + vstmia r5, {q12-q15} + + sub r1, r1, r2, lsl #7 + str r1, [sp, #4*16+1*4] +scrypt_core_3way_loop2: + str r2, [sp, #4*16+2*4] + + ldr r0, [sp, #4*16+0*4] + ldr r1, [sp, #4*16+1*4] + ldr r2, [sp, #4*16+3*4] + ldr r4, [r0, #16*4] + sub r2, r2, #1 + and r4, r4, r2 + add r1, r1, r4, lsl #7 + add r2, r0, #16*4 + add r3, r1, #16*4 + mov r12, sp + ldmia r3!, {r8-r11} + ldmia r2, {r4-r7} + eor r8, r8, r4 + eor r9, r9, r5 + eor r10, r10, r6 + eor r11, r11, r7 + ldmia r0, {r4-r7} + stmia r2!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + ldmia r1!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r0!, {r4-r7} + stmia r12!, {r4-r7} + ldmia r3!, {r8-r11} + ldmia r2, {r4-r7} + eor r8, r8, r4 + eor r9, r9, r5 + eor r10, r10, r6 + eor r11, r11, r7 + ldmia r0, {r4-r7} + stmia r2!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + ldmia r1!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r0!, {r4-r7} + stmia r12!, {r4-r7} + ldmia r3!, {r8-r11} + ldmia r2, {r4-r7} + eor r8, r8, r4 + eor r9, r9, r5 + eor r10, r10, r6 + eor r11, r11, r7 + ldmia r0, {r4-r7} + stmia r2!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + ldmia r1!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r0!, {r4-r7} + stmia r12!, {r4-r7} + ldmia r3!, {r8-r11} + ldmia r2, {r4-r7} + eor r8, r8, r4 + eor r9, r9, r5 + eor r10, r10, r6 + eor r11, r11, r7 + ldmia r0, {r4-r7} + stmia r2!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + ldmia r1!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r0!, {r4-r7} + stmia r12!, {r4-r7} + + ldr r1, [sp, #4*16+1*4] + ldr r2, [sp, #4*16+3*4] + add r1, r1, r2, lsl #7 + add r3, r1, r2, lsl #7 + sub r2, r2, #1 + vmov r6, r7, d8 + and r6, r6, r2 + add r6, r1, r6, lsl #7 + vmov r7, r8, d24 + add lr, sp, #128 + vldmia lr, {q0-q3} + pld [r6] + pld [r6, #8*4] + pld [r6, #16*4] + pld [r6, #24*4] + vldmia r6, {q8-q15} + and r7, r7, r2 + add r7, r3, r7, lsl #7 + veor.u32 q8, q8, q0 + veor.u32 q9, q9, q1 + veor.u32 q10, q10, q2 + veor.u32 q11, q11, q3 + pld [r7] + pld [r7, #8*4] + pld [r7, #16*4] + pld [r7, #24*4] + veor.u32 q12, q12, q4 + veor.u32 q13, q13, q5 + veor.u32 q14, q14, q6 + veor.u32 q15, q15, q7 + vldmia r7, {q0-q7} + vstmia lr, {q8-q15} + add r12, sp, #256 + vldmia r12, {q8-q15} + veor.u32 q8, q8, q0 + veor.u32 q9, q9, q1 + veor.u32 q10, q10, q2 + veor.u32 q11, q11, q3 + veor.u32 q12, q12, q4 + veor.u32 q13, q13, q5 + veor.u32 q14, q14, q6 + veor.u32 q15, q15, q7 + + vldmia lr, {q0-q7} + veor.u32 q0, q0, q4 + veor.u32 q1, q1, q5 + veor.u32 q2, q2, q6 + veor.u32 q3, q3, q7 + vstmia lr, {q0-q3} + veor.u32 q8, q8, q12 + veor.u32 q9, q9, q13 + veor.u32 q10, q10, q14 + veor.u32 q11, q11, q15 + vstmia r12, {q8-q15} + + ldmia sp, 
{r0-r12, lr} + ldrd r10, [sp, #14*4] + ldrd r6, [sp, #6*4] + vadd.u32 q4, q0, q1 + add r6, r2, r6 + vadd.u32 q6, q8, q9 + add r7, r3, r7 + vshl.u32 q5, q4, #7 + eor r10, r10, r6, ror #25 + vshl.u32 q7, q6, #7 + add r6, r0, r4 + vshr.u32 q4, q4, #32-7 + eor r11, r11, r7, ror #25 + vshr.u32 q6, q6, #32-7 + add r7, r1, r5 + veor.u32 q3, q3, q5 + strd r10, [sp, #14*4] + veor.u32 q11, q11, q7 + eor r12, r12, r6, ror #25 + veor.u32 q3, q3, q4 + eor lr, lr, r7, ror #25 + veor.u32 q11, q11, q6 + + ldrd r6, [sp, #10*4] + vadd.u32 q4, q3, q0 + add r2, r10, r2 + vadd.u32 q6, q11, q8 + add r3, r11, r3 + vshl.u32 q5, q4, #9 + eor r6, r6, r2, ror #23 + vshl.u32 q7, q6, #9 + add r2, r12, r0 + vshr.u32 q4, q4, #32-9 + eor r7, r7, r3, ror #23 + vshr.u32 q6, q6, #32-9 + add r3, lr, r1 + veor.u32 q2, q2, q5 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q7 + eor r8, r8, r2, ror #23 + veor.u32 q2, q2, q4 + eor r9, r9, r3, ror #23 + veor.u32 q10, q10, q6 + + ldrd r2, [sp, #6*4] + vadd.u32 q4, q2, q3 + add r10, r6, r10 + vadd.u32 q6, q10, q11 + add r11, r7, r11 + vext.u32 q3, q3, q3, #3 + eor r2, r2, r10, ror #19 + vshl.u32 q5, q4, #13 + add r10, r8, r12 + vext.u32 q11, q11, q11, #3 + eor r3, r3, r11, ror #19 + vshl.u32 q7, q6, #13 + add r11, r9, lr + vshr.u32 q4, q4, #32-13 + eor r4, r4, r10, ror #19 + vshr.u32 q6, q6, #32-13 + eor r5, r5, r11, ror #19 + veor.u32 q1, q1, q5 + veor.u32 q9, q9, q7 + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + ldrd r10, [sp, #2*4] + vadd.u32 q4, q1, q2 + add r6, r2, r6 + vadd.u32 q6, q9, q10 + add r7, r3, r7 + vswp.u32 d4, d5 + eor r10, r10, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, r4, r8 + vswp.u32 d20, d21 + eor r11, r11, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r5, r9 + vshr.u32 q4, q4, #32-18 + eor r0, r0, r6, ror #14 + vshr.u32 q6, q6, #32-18 + eor r1, r1, r7, ror #14 + veor.u32 q0, q0, q5 + ldrd r6, [sp, #14*4] + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + + + strd r2, [sp, #6*4] + vadd.u32 q4, q0, q3 + strd r10, [sp, #2*4] + vadd.u32 q6, q8, q11 + add r6, r11, r6 + vext.u32 q1, q1, q1, #1 + add r7, r0, r7 + vshl.u32 q5, q4, #7 + eor r4, r4, r6, ror #25 + vext.u32 q9, q9, q9, #1 + add r6, r1, r12 + vshl.u32 q7, q6, #7 + eor r5, r5, r7, ror #25 + vshr.u32 q4, q4, #32-7 + add r7, r10, lr + vshr.u32 q6, q6, #32-7 + eor r2, r2, r6, ror #25 + veor.u32 q1, q1, q5 + eor r3, r3, r7, ror #25 + veor.u32 q9, q9, q7 + strd r2, [sp, #6*4] + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + add r10, r3, r10 + vadd.u32 q4, q1, q0 + ldrd r6, [sp, #10*4] + vadd.u32 q6, q9, q8 + add r11, r4, r11 + vshl.u32 q5, q4, #9 + eor r8, r8, r10, ror #23 + vshl.u32 q7, q6, #9 + add r10, r5, r0 + vshr.u32 q4, q4, #32-9 + eor r9, r9, r11, ror #23 + vshr.u32 q6, q6, #32-9 + add r11, r2, r1 + veor.u32 q2, q2, q5 + eor r6, r6, r10, ror #23 + veor.u32 q10, q10, q7 + eor r7, r7, r11, ror #23 + veor.u32 q2, q2, q4 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q6 + + add r2, r7, r2 + vadd.u32 q4, q2, q1 + ldrd r10, [sp, #14*4] + vadd.u32 q6, q10, q9 + add r3, r8, r3 + vext.u32 q1, q1, q1, #3 + eor r12, r12, r2, ror #19 + vshl.u32 q5, q4, #13 + add r2, r9, r4 + vext.u32 q9, q9, q9, #3 + eor lr, lr, r3, ror #19 + vshl.u32 q7, q6, #13 + add r3, r6, r5 + vshr.u32 q4, q4, #32-13 + eor r10, r10, r2, ror #19 + vshr.u32 q6, q6, #32-13 + eor r11, r11, r3, ror #19 + veor.u32 q3, q3, q5 + veor.u32 q11, q11, q7 + veor.u32 q3, q3, q4 + veor.u32 q11, q11, q6 + + ldrd r2, [sp, #2*4] + vadd.u32 q4, q3, q2 + add r6, r11, r6 + vadd.u32 q6, q11, q10 + add r7, r12, r7 + vswp.u32 d4, d5 + eor r0, r0, r6, ror 
#14 + vshl.u32 q5, q4, #18 + add r6, lr, r8 + vswp.u32 d20, d21 + eor r1, r1, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r10, r9 + vext.u32 q3, q3, q3, #1 + eor r2, r2, r6, ror #14 + vshr.u32 q4, q4, #32-18 + eor r3, r3, r7, ror #14 + vshr.u32 q6, q6, #32-18 + strd r2, [sp, #2*4] + vext.u32 q11, q11, q11, #1 + strd r10, [sp, #14*4] + veor.u32 q0, q0, q5 + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + ldrd r6, [sp, #6*4] + vadd.u32 q4, q0, q1 + add r6, r2, r6 + vadd.u32 q6, q8, q9 + add r7, r3, r7 + vshl.u32 q5, q4, #7 + eor r10, r10, r6, ror #25 + vshl.u32 q7, q6, #7 + add r6, r0, r4 + vshr.u32 q4, q4, #32-7 + eor r11, r11, r7, ror #25 + vshr.u32 q6, q6, #32-7 + add r7, r1, r5 + veor.u32 q3, q3, q5 + strd r10, [sp, #14*4] + veor.u32 q11, q11, q7 + eor r12, r12, r6, ror #25 + veor.u32 q3, q3, q4 + eor lr, lr, r7, ror #25 + veor.u32 q11, q11, q6 + + ldrd r6, [sp, #10*4] + vadd.u32 q4, q3, q0 + add r2, r10, r2 + vadd.u32 q6, q11, q8 + add r3, r11, r3 + vshl.u32 q5, q4, #9 + eor r6, r6, r2, ror #23 + vshl.u32 q7, q6, #9 + add r2, r12, r0 + vshr.u32 q4, q4, #32-9 + eor r7, r7, r3, ror #23 + vshr.u32 q6, q6, #32-9 + add r3, lr, r1 + veor.u32 q2, q2, q5 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q7 + eor r8, r8, r2, ror #23 + veor.u32 q2, q2, q4 + eor r9, r9, r3, ror #23 + veor.u32 q10, q10, q6 + + ldrd r2, [sp, #6*4] + vadd.u32 q4, q2, q3 + add r10, r6, r10 + vadd.u32 q6, q10, q11 + add r11, r7, r11 + vext.u32 q3, q3, q3, #3 + eor r2, r2, r10, ror #19 + vshl.u32 q5, q4, #13 + add r10, r8, r12 + vext.u32 q11, q11, q11, #3 + eor r3, r3, r11, ror #19 + vshl.u32 q7, q6, #13 + add r11, r9, lr + vshr.u32 q4, q4, #32-13 + eor r4, r4, r10, ror #19 + vshr.u32 q6, q6, #32-13 + eor r5, r5, r11, ror #19 + veor.u32 q1, q1, q5 + veor.u32 q9, q9, q7 + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + ldrd r10, [sp, #2*4] + vadd.u32 q4, q1, q2 + add r6, r2, r6 + vadd.u32 q6, q9, q10 + add r7, r3, r7 + vswp.u32 d4, d5 + eor r10, r10, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, r4, r8 + vswp.u32 d20, d21 + eor r11, r11, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r5, r9 + vshr.u32 q4, q4, #32-18 + eor r0, r0, r6, ror #14 + vshr.u32 q6, q6, #32-18 + eor r1, r1, r7, ror #14 + veor.u32 q0, q0, q5 + ldrd r6, [sp, #14*4] + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + + + strd r2, [sp, #6*4] + vadd.u32 q4, q0, q3 + strd r10, [sp, #2*4] + vadd.u32 q6, q8, q11 + add r6, r11, r6 + vext.u32 q1, q1, q1, #1 + add r7, r0, r7 + vshl.u32 q5, q4, #7 + eor r4, r4, r6, ror #25 + vext.u32 q9, q9, q9, #1 + add r6, r1, r12 + vshl.u32 q7, q6, #7 + eor r5, r5, r7, ror #25 + vshr.u32 q4, q4, #32-7 + add r7, r10, lr + vshr.u32 q6, q6, #32-7 + eor r2, r2, r6, ror #25 + veor.u32 q1, q1, q5 + eor r3, r3, r7, ror #25 + veor.u32 q9, q9, q7 + strd r2, [sp, #6*4] + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + add r10, r3, r10 + vadd.u32 q4, q1, q0 + ldrd r6, [sp, #10*4] + vadd.u32 q6, q9, q8 + add r11, r4, r11 + vshl.u32 q5, q4, #9 + eor r8, r8, r10, ror #23 + vshl.u32 q7, q6, #9 + add r10, r5, r0 + vshr.u32 q4, q4, #32-9 + eor r9, r9, r11, ror #23 + vshr.u32 q6, q6, #32-9 + add r11, r2, r1 + veor.u32 q2, q2, q5 + eor r6, r6, r10, ror #23 + veor.u32 q10, q10, q7 + eor r7, r7, r11, ror #23 + veor.u32 q2, q2, q4 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q6 + + add r2, r7, r2 + vadd.u32 q4, q2, q1 + ldrd r10, [sp, #14*4] + vadd.u32 q6, q10, q9 + add r3, r8, r3 + vext.u32 q1, q1, q1, #3 + eor r12, r12, r2, ror #19 + vshl.u32 q5, q4, #13 + add r2, r9, r4 + vext.u32 q9, q9, q9, #3 + eor lr, lr, r3, ror #19 
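+ @ note: plain ARM and NEON instructions are interleaved on purpose throughout
+ @ this 3-way core: the integer registers process one scrypt lane while the
+ @ NEON q registers process the other two, which helps hide instruction latencies.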
+ vshl.u32 q7, q6, #13 + add r3, r6, r5 + vshr.u32 q4, q4, #32-13 + eor r10, r10, r2, ror #19 + vshr.u32 q6, q6, #32-13 + eor r11, r11, r3, ror #19 + veor.u32 q3, q3, q5 + veor.u32 q11, q11, q7 + veor.u32 q3, q3, q4 + veor.u32 q11, q11, q6 + + ldrd r2, [sp, #2*4] + vadd.u32 q4, q3, q2 + add r6, r11, r6 + vadd.u32 q6, q11, q10 + add r7, r12, r7 + vswp.u32 d4, d5 + eor r0, r0, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, lr, r8 + vswp.u32 d20, d21 + eor r1, r1, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r10, r9 + vext.u32 q3, q3, q3, #1 + eor r2, r2, r6, ror #14 + vshr.u32 q4, q4, #32-18 + eor r3, r3, r7, ror #14 + vshr.u32 q6, q6, #32-18 + strd r2, [sp, #2*4] + vext.u32 q11, q11, q11, #1 + strd r10, [sp, #14*4] + veor.u32 q0, q0, q5 + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + ldrd r6, [sp, #6*4] + vadd.u32 q4, q0, q1 + add r6, r2, r6 + vadd.u32 q6, q8, q9 + add r7, r3, r7 + vshl.u32 q5, q4, #7 + eor r10, r10, r6, ror #25 + vshl.u32 q7, q6, #7 + add r6, r0, r4 + vshr.u32 q4, q4, #32-7 + eor r11, r11, r7, ror #25 + vshr.u32 q6, q6, #32-7 + add r7, r1, r5 + veor.u32 q3, q3, q5 + strd r10, [sp, #14*4] + veor.u32 q11, q11, q7 + eor r12, r12, r6, ror #25 + veor.u32 q3, q3, q4 + eor lr, lr, r7, ror #25 + veor.u32 q11, q11, q6 + + ldrd r6, [sp, #10*4] + vadd.u32 q4, q3, q0 + add r2, r10, r2 + vadd.u32 q6, q11, q8 + add r3, r11, r3 + vshl.u32 q5, q4, #9 + eor r6, r6, r2, ror #23 + vshl.u32 q7, q6, #9 + add r2, r12, r0 + vshr.u32 q4, q4, #32-9 + eor r7, r7, r3, ror #23 + vshr.u32 q6, q6, #32-9 + add r3, lr, r1 + veor.u32 q2, q2, q5 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q7 + eor r8, r8, r2, ror #23 + veor.u32 q2, q2, q4 + eor r9, r9, r3, ror #23 + veor.u32 q10, q10, q6 + + ldrd r2, [sp, #6*4] + vadd.u32 q4, q2, q3 + add r10, r6, r10 + vadd.u32 q6, q10, q11 + add r11, r7, r11 + vext.u32 q3, q3, q3, #3 + eor r2, r2, r10, ror #19 + vshl.u32 q5, q4, #13 + add r10, r8, r12 + vext.u32 q11, q11, q11, #3 + eor r3, r3, r11, ror #19 + vshl.u32 q7, q6, #13 + add r11, r9, lr + vshr.u32 q4, q4, #32-13 + eor r4, r4, r10, ror #19 + vshr.u32 q6, q6, #32-13 + eor r5, r5, r11, ror #19 + veor.u32 q1, q1, q5 + veor.u32 q9, q9, q7 + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + ldrd r10, [sp, #2*4] + vadd.u32 q4, q1, q2 + add r6, r2, r6 + vadd.u32 q6, q9, q10 + add r7, r3, r7 + vswp.u32 d4, d5 + eor r10, r10, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, r4, r8 + vswp.u32 d20, d21 + eor r11, r11, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r5, r9 + vshr.u32 q4, q4, #32-18 + eor r0, r0, r6, ror #14 + vshr.u32 q6, q6, #32-18 + eor r1, r1, r7, ror #14 + veor.u32 q0, q0, q5 + ldrd r6, [sp, #14*4] + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + + + strd r2, [sp, #6*4] + vadd.u32 q4, q0, q3 + strd r10, [sp, #2*4] + vadd.u32 q6, q8, q11 + add r6, r11, r6 + vext.u32 q1, q1, q1, #1 + add r7, r0, r7 + vshl.u32 q5, q4, #7 + eor r4, r4, r6, ror #25 + vext.u32 q9, q9, q9, #1 + add r6, r1, r12 + vshl.u32 q7, q6, #7 + eor r5, r5, r7, ror #25 + vshr.u32 q4, q4, #32-7 + add r7, r10, lr + vshr.u32 q6, q6, #32-7 + eor r2, r2, r6, ror #25 + veor.u32 q1, q1, q5 + eor r3, r3, r7, ror #25 + veor.u32 q9, q9, q7 + strd r2, [sp, #6*4] + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + add r10, r3, r10 + vadd.u32 q4, q1, q0 + ldrd r6, [sp, #10*4] + vadd.u32 q6, q9, q8 + add r11, r4, r11 + vshl.u32 q5, q4, #9 + eor r8, r8, r10, ror #23 + vshl.u32 q7, q6, #9 + add r10, r5, r0 + vshr.u32 q4, q4, #32-9 + eor r9, r9, r11, ror #23 + vshr.u32 q6, q6, #32-9 + add r11, r2, r1 + veor.u32 q2, q2, q5 + eor r6, 
r6, r10, ror #23 + veor.u32 q10, q10, q7 + eor r7, r7, r11, ror #23 + veor.u32 q2, q2, q4 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q6 + + add r2, r7, r2 + vadd.u32 q4, q2, q1 + ldrd r10, [sp, #14*4] + vadd.u32 q6, q10, q9 + add r3, r8, r3 + vext.u32 q1, q1, q1, #3 + eor r12, r12, r2, ror #19 + vshl.u32 q5, q4, #13 + add r2, r9, r4 + vext.u32 q9, q9, q9, #3 + eor lr, lr, r3, ror #19 + vshl.u32 q7, q6, #13 + add r3, r6, r5 + vshr.u32 q4, q4, #32-13 + eor r10, r10, r2, ror #19 + vshr.u32 q6, q6, #32-13 + eor r11, r11, r3, ror #19 + veor.u32 q3, q3, q5 + veor.u32 q11, q11, q7 + veor.u32 q3, q3, q4 + veor.u32 q11, q11, q6 + + ldrd r2, [sp, #2*4] + vadd.u32 q4, q3, q2 + add r6, r11, r6 + vadd.u32 q6, q11, q10 + add r7, r12, r7 + vswp.u32 d4, d5 + eor r0, r0, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, lr, r8 + vswp.u32 d20, d21 + eor r1, r1, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r10, r9 + vext.u32 q3, q3, q3, #1 + eor r2, r2, r6, ror #14 + vshr.u32 q4, q4, #32-18 + eor r3, r3, r7, ror #14 + vshr.u32 q6, q6, #32-18 + strd r2, [sp, #2*4] + vext.u32 q11, q11, q11, #1 + strd r10, [sp, #14*4] + veor.u32 q0, q0, q5 + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + ldrd r6, [sp, #6*4] + vadd.u32 q4, q0, q1 + add r6, r2, r6 + vadd.u32 q6, q8, q9 + add r7, r3, r7 + vshl.u32 q5, q4, #7 + eor r10, r10, r6, ror #25 + vshl.u32 q7, q6, #7 + add r6, r0, r4 + vshr.u32 q4, q4, #32-7 + eor r11, r11, r7, ror #25 + vshr.u32 q6, q6, #32-7 + add r7, r1, r5 + veor.u32 q3, q3, q5 + strd r10, [sp, #14*4] + veor.u32 q11, q11, q7 + eor r12, r12, r6, ror #25 + veor.u32 q3, q3, q4 + eor lr, lr, r7, ror #25 + veor.u32 q11, q11, q6 + + ldrd r6, [sp, #10*4] + vadd.u32 q4, q3, q0 + add r2, r10, r2 + vadd.u32 q6, q11, q8 + add r3, r11, r3 + vshl.u32 q5, q4, #9 + eor r6, r6, r2, ror #23 + vshl.u32 q7, q6, #9 + add r2, r12, r0 + vshr.u32 q4, q4, #32-9 + eor r7, r7, r3, ror #23 + vshr.u32 q6, q6, #32-9 + add r3, lr, r1 + veor.u32 q2, q2, q5 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q7 + eor r8, r8, r2, ror #23 + veor.u32 q2, q2, q4 + eor r9, r9, r3, ror #23 + veor.u32 q10, q10, q6 + + ldrd r2, [sp, #6*4] + vadd.u32 q4, q2, q3 + add r10, r6, r10 + vadd.u32 q6, q10, q11 + add r11, r7, r11 + vext.u32 q3, q3, q3, #3 + eor r2, r2, r10, ror #19 + vshl.u32 q5, q4, #13 + add r10, r8, r12 + vext.u32 q11, q11, q11, #3 + eor r3, r3, r11, ror #19 + vshl.u32 q7, q6, #13 + add r11, r9, lr + vshr.u32 q4, q4, #32-13 + eor r4, r4, r10, ror #19 + vshr.u32 q6, q6, #32-13 + eor r5, r5, r11, ror #19 + veor.u32 q1, q1, q5 + veor.u32 q9, q9, q7 + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + ldrd r10, [sp, #2*4] + vadd.u32 q4, q1, q2 + add r6, r2, r6 + vadd.u32 q6, q9, q10 + add r7, r3, r7 + vswp.u32 d4, d5 + eor r10, r10, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, r4, r8 + vswp.u32 d20, d21 + eor r11, r11, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r5, r9 + vshr.u32 q4, q4, #32-18 + eor r0, r0, r6, ror #14 + vshr.u32 q6, q6, #32-18 + eor r1, r1, r7, ror #14 + veor.u32 q0, q0, q5 + ldrd r6, [sp, #14*4] + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + + + strd r2, [sp, #6*4] + vadd.u32 q4, q0, q3 + strd r10, [sp, #2*4] + vadd.u32 q6, q8, q11 + add r6, r11, r6 + vext.u32 q1, q1, q1, #1 + add r7, r0, r7 + vshl.u32 q5, q4, #7 + eor r4, r4, r6, ror #25 + vext.u32 q9, q9, q9, #1 + add r6, r1, r12 + vshl.u32 q7, q6, #7 + eor r5, r5, r7, ror #25 + vshr.u32 q4, q4, #32-7 + add r7, r10, lr + vshr.u32 q6, q6, #32-7 + eor r2, r2, r6, ror #25 + veor.u32 q1, q1, q5 + eor r3, r3, r7, ror #25 + veor.u32 q9, q9, q7 + 
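+ @ note: salsa20/8 consists of four identical double-rounds; the surrounding
+ @ code simply repeats the same scalar/NEON double-round pattern four times
+ @ per invocation instead of looping.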
strd r2, [sp, #6*4] + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + add r10, r3, r10 + vadd.u32 q4, q1, q0 + ldrd r6, [sp, #10*4] + vadd.u32 q6, q9, q8 + add r11, r4, r11 + vshl.u32 q5, q4, #9 + eor r8, r8, r10, ror #23 + vshl.u32 q7, q6, #9 + add r10, r5, r0 + vshr.u32 q4, q4, #32-9 + eor r9, r9, r11, ror #23 + vshr.u32 q6, q6, #32-9 + add r11, r2, r1 + veor.u32 q2, q2, q5 + eor r6, r6, r10, ror #23 + veor.u32 q10, q10, q7 + eor r7, r7, r11, ror #23 + veor.u32 q2, q2, q4 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q6 + + add r2, r7, r2 + vadd.u32 q4, q2, q1 + ldrd r10, [sp, #14*4] + vadd.u32 q6, q10, q9 + add r3, r8, r3 + vext.u32 q1, q1, q1, #3 + eor r12, r12, r2, ror #19 + vshl.u32 q5, q4, #13 + add r2, r9, r4 + vext.u32 q9, q9, q9, #3 + eor lr, lr, r3, ror #19 + vshl.u32 q7, q6, #13 + add r3, r6, r5 + vshr.u32 q4, q4, #32-13 + eor r10, r10, r2, ror #19 + vshr.u32 q6, q6, #32-13 + eor r11, r11, r3, ror #19 + veor.u32 q3, q3, q5 + veor.u32 q11, q11, q7 + veor.u32 q3, q3, q4 + veor.u32 q11, q11, q6 + + ldrd r2, [sp, #2*4] + vadd.u32 q4, q3, q2 + add r6, r11, r6 + vadd.u32 q6, q11, q10 + add r7, r12, r7 + vswp.u32 d4, d5 + eor r0, r0, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, lr, r8 + vswp.u32 d20, d21 + eor r1, r1, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r10, r9 + vext.u32 q3, q3, q3, #1 + eor r2, r2, r6, ror #14 + vshr.u32 q4, q4, #32-18 + eor r3, r3, r7, ror #14 + vshr.u32 q6, q6, #32-18 + strd r2, [sp, #2*4] + vext.u32 q11, q11, q11, #1 + strd r10, [sp, #14*4] + veor.u32 q0, q0, q5 + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + stmia sp, {r0-r5} + strd r8, [sp, #8*4] + str r12, [sp, #12*4] + str lr, [sp, #13*4] + + ldr r0, [sp, #4*16+0*4] + mov r12, sp + add r2, r0, #16*4 + ldmia r12, {r4-r7} + ldmia r0, {r8-r11} + add r4, r4, r8 + add r5, r5, r9 + add r6, r6, r10 + add r7, r7, r11 + stmia r0!, {r4-r7} + ldmia r2, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r2!, {r4-r7} + stmia r12!, {r4-r7} + ldmia r12, {r4-r7} + ldmia r0, {r8-r11} + add r4, r4, r8 + add r5, r5, r9 + add r6, r6, r10 + add r7, r7, r11 + stmia r0!, {r4-r7} + ldmia r2, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r2!, {r4-r7} + stmia r12!, {r4-r7} + ldmia r12, {r4-r7} + ldmia r0, {r8-r11} + add r4, r4, r8 + add r5, r5, r9 + add r6, r6, r10 + add r7, r7, r11 + stmia r0!, {r4-r7} + ldmia r2, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r2!, {r4-r7} + stmia r12!, {r4-r7} + ldmia r12, {r4-r7} + ldmia r0, {r8-r11} + add r4, r4, r8 + add r5, r5, r9 + add r6, r6, r10 + add r7, r7, r11 + stmia r0!, {r4-r7} + ldmia r2, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r2!, {r4-r7} + stmia r12!, {r4-r7} + + add lr, sp, #128 + vldmia lr, {q4-q7} + vadd.u32 q4, q4, q0 + vadd.u32 q5, q5, q1 + vadd.u32 q6, q6, q2 + vadd.u32 q7, q7, q3 + add r12, sp, #256 + vldmia r12, {q12-q15} + vstmia lr, {q4-q7} + vadd.u32 q12, q12, q8 + vadd.u32 q13, q13, q9 + vadd.u32 q14, q14, q10 + vadd.u32 q15, q15, q11 + + add r4, sp, #128+4*16 + vldmia r4, {q0-q3} + vstmia r12, {q12-q15} + veor.u32 q0, q0, q4 + veor.u32 q1, q1, q5 + veor.u32 q2, q2, q6 + veor.u32 q3, q3, q7 + add r5, sp, #256+4*16 + vldmia r5, {q8-q11} + vstmia r4, {q0-q3} + veor.u32 q8, q8, q12 + veor.u32 q9, q9, q13 + veor.u32 q10, q10, q14 + veor.u32 q11, q11, q15 + vmov q12, q8 + vmov q13, q9 + vmov q14, q10 + vmov q15, q11 + + ldmia sp, {r0-r12, lr} + ldrd r10, [sp, #14*4] + ldrd r6, [sp, #6*4] + 
vadd.u32 q4, q0, q1 + add r6, r2, r6 + vadd.u32 q6, q8, q9 + add r7, r3, r7 + vshl.u32 q5, q4, #7 + eor r10, r10, r6, ror #25 + vshl.u32 q7, q6, #7 + add r6, r0, r4 + vshr.u32 q4, q4, #32-7 + eor r11, r11, r7, ror #25 + vshr.u32 q6, q6, #32-7 + add r7, r1, r5 + veor.u32 q3, q3, q5 + strd r10, [sp, #14*4] + veor.u32 q11, q11, q7 + eor r12, r12, r6, ror #25 + veor.u32 q3, q3, q4 + eor lr, lr, r7, ror #25 + veor.u32 q11, q11, q6 + + ldrd r6, [sp, #10*4] + vadd.u32 q4, q3, q0 + add r2, r10, r2 + vadd.u32 q6, q11, q8 + add r3, r11, r3 + vshl.u32 q5, q4, #9 + eor r6, r6, r2, ror #23 + vshl.u32 q7, q6, #9 + add r2, r12, r0 + vshr.u32 q4, q4, #32-9 + eor r7, r7, r3, ror #23 + vshr.u32 q6, q6, #32-9 + add r3, lr, r1 + veor.u32 q2, q2, q5 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q7 + eor r8, r8, r2, ror #23 + veor.u32 q2, q2, q4 + eor r9, r9, r3, ror #23 + veor.u32 q10, q10, q6 + + ldrd r2, [sp, #6*4] + vadd.u32 q4, q2, q3 + add r10, r6, r10 + vadd.u32 q6, q10, q11 + add r11, r7, r11 + vext.u32 q3, q3, q3, #3 + eor r2, r2, r10, ror #19 + vshl.u32 q5, q4, #13 + add r10, r8, r12 + vext.u32 q11, q11, q11, #3 + eor r3, r3, r11, ror #19 + vshl.u32 q7, q6, #13 + add r11, r9, lr + vshr.u32 q4, q4, #32-13 + eor r4, r4, r10, ror #19 + vshr.u32 q6, q6, #32-13 + eor r5, r5, r11, ror #19 + veor.u32 q1, q1, q5 + veor.u32 q9, q9, q7 + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + ldrd r10, [sp, #2*4] + vadd.u32 q4, q1, q2 + add r6, r2, r6 + vadd.u32 q6, q9, q10 + add r7, r3, r7 + vswp.u32 d4, d5 + eor r10, r10, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, r4, r8 + vswp.u32 d20, d21 + eor r11, r11, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r5, r9 + vshr.u32 q4, q4, #32-18 + eor r0, r0, r6, ror #14 + vshr.u32 q6, q6, #32-18 + eor r1, r1, r7, ror #14 + veor.u32 q0, q0, q5 + ldrd r6, [sp, #14*4] + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + + + strd r2, [sp, #6*4] + vadd.u32 q4, q0, q3 + strd r10, [sp, #2*4] + vadd.u32 q6, q8, q11 + add r6, r11, r6 + vext.u32 q1, q1, q1, #1 + add r7, r0, r7 + vshl.u32 q5, q4, #7 + eor r4, r4, r6, ror #25 + vext.u32 q9, q9, q9, #1 + add r6, r1, r12 + vshl.u32 q7, q6, #7 + eor r5, r5, r7, ror #25 + vshr.u32 q4, q4, #32-7 + add r7, r10, lr + vshr.u32 q6, q6, #32-7 + eor r2, r2, r6, ror #25 + veor.u32 q1, q1, q5 + eor r3, r3, r7, ror #25 + veor.u32 q9, q9, q7 + strd r2, [sp, #6*4] + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + add r10, r3, r10 + vadd.u32 q4, q1, q0 + ldrd r6, [sp, #10*4] + vadd.u32 q6, q9, q8 + add r11, r4, r11 + vshl.u32 q5, q4, #9 + eor r8, r8, r10, ror #23 + vshl.u32 q7, q6, #9 + add r10, r5, r0 + vshr.u32 q4, q4, #32-9 + eor r9, r9, r11, ror #23 + vshr.u32 q6, q6, #32-9 + add r11, r2, r1 + veor.u32 q2, q2, q5 + eor r6, r6, r10, ror #23 + veor.u32 q10, q10, q7 + eor r7, r7, r11, ror #23 + veor.u32 q2, q2, q4 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q6 + + add r2, r7, r2 + vadd.u32 q4, q2, q1 + ldrd r10, [sp, #14*4] + vadd.u32 q6, q10, q9 + add r3, r8, r3 + vext.u32 q1, q1, q1, #3 + eor r12, r12, r2, ror #19 + vshl.u32 q5, q4, #13 + add r2, r9, r4 + vext.u32 q9, q9, q9, #3 + eor lr, lr, r3, ror #19 + vshl.u32 q7, q6, #13 + add r3, r6, r5 + vshr.u32 q4, q4, #32-13 + eor r10, r10, r2, ror #19 + vshr.u32 q6, q6, #32-13 + eor r11, r11, r3, ror #19 + veor.u32 q3, q3, q5 + veor.u32 q11, q11, q7 + veor.u32 q3, q3, q4 + veor.u32 q11, q11, q6 + + ldrd r2, [sp, #2*4] + vadd.u32 q4, q3, q2 + add r6, r11, r6 + vadd.u32 q6, q11, q10 + add r7, r12, r7 + vswp.u32 d4, d5 + eor r0, r0, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, lr, r8 + vswp.u32 d20, 
d21 + eor r1, r1, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r10, r9 + vext.u32 q3, q3, q3, #1 + eor r2, r2, r6, ror #14 + vshr.u32 q4, q4, #32-18 + eor r3, r3, r7, ror #14 + vshr.u32 q6, q6, #32-18 + strd r2, [sp, #2*4] + vext.u32 q11, q11, q11, #1 + strd r10, [sp, #14*4] + veor.u32 q0, q0, q5 + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + ldrd r6, [sp, #6*4] + vadd.u32 q4, q0, q1 + add r6, r2, r6 + vadd.u32 q6, q8, q9 + add r7, r3, r7 + vshl.u32 q5, q4, #7 + eor r10, r10, r6, ror #25 + vshl.u32 q7, q6, #7 + add r6, r0, r4 + vshr.u32 q4, q4, #32-7 + eor r11, r11, r7, ror #25 + vshr.u32 q6, q6, #32-7 + add r7, r1, r5 + veor.u32 q3, q3, q5 + strd r10, [sp, #14*4] + veor.u32 q11, q11, q7 + eor r12, r12, r6, ror #25 + veor.u32 q3, q3, q4 + eor lr, lr, r7, ror #25 + veor.u32 q11, q11, q6 + + ldrd r6, [sp, #10*4] + vadd.u32 q4, q3, q0 + add r2, r10, r2 + vadd.u32 q6, q11, q8 + add r3, r11, r3 + vshl.u32 q5, q4, #9 + eor r6, r6, r2, ror #23 + vshl.u32 q7, q6, #9 + add r2, r12, r0 + vshr.u32 q4, q4, #32-9 + eor r7, r7, r3, ror #23 + vshr.u32 q6, q6, #32-9 + add r3, lr, r1 + veor.u32 q2, q2, q5 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q7 + eor r8, r8, r2, ror #23 + veor.u32 q2, q2, q4 + eor r9, r9, r3, ror #23 + veor.u32 q10, q10, q6 + + ldrd r2, [sp, #6*4] + vadd.u32 q4, q2, q3 + add r10, r6, r10 + vadd.u32 q6, q10, q11 + add r11, r7, r11 + vext.u32 q3, q3, q3, #3 + eor r2, r2, r10, ror #19 + vshl.u32 q5, q4, #13 + add r10, r8, r12 + vext.u32 q11, q11, q11, #3 + eor r3, r3, r11, ror #19 + vshl.u32 q7, q6, #13 + add r11, r9, lr + vshr.u32 q4, q4, #32-13 + eor r4, r4, r10, ror #19 + vshr.u32 q6, q6, #32-13 + eor r5, r5, r11, ror #19 + veor.u32 q1, q1, q5 + veor.u32 q9, q9, q7 + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + ldrd r10, [sp, #2*4] + vadd.u32 q4, q1, q2 + add r6, r2, r6 + vadd.u32 q6, q9, q10 + add r7, r3, r7 + vswp.u32 d4, d5 + eor r10, r10, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, r4, r8 + vswp.u32 d20, d21 + eor r11, r11, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r5, r9 + vshr.u32 q4, q4, #32-18 + eor r0, r0, r6, ror #14 + vshr.u32 q6, q6, #32-18 + eor r1, r1, r7, ror #14 + veor.u32 q0, q0, q5 + ldrd r6, [sp, #14*4] + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + + + strd r2, [sp, #6*4] + vadd.u32 q4, q0, q3 + strd r10, [sp, #2*4] + vadd.u32 q6, q8, q11 + add r6, r11, r6 + vext.u32 q1, q1, q1, #1 + add r7, r0, r7 + vshl.u32 q5, q4, #7 + eor r4, r4, r6, ror #25 + vext.u32 q9, q9, q9, #1 + add r6, r1, r12 + vshl.u32 q7, q6, #7 + eor r5, r5, r7, ror #25 + vshr.u32 q4, q4, #32-7 + add r7, r10, lr + vshr.u32 q6, q6, #32-7 + eor r2, r2, r6, ror #25 + veor.u32 q1, q1, q5 + eor r3, r3, r7, ror #25 + veor.u32 q9, q9, q7 + strd r2, [sp, #6*4] + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + add r10, r3, r10 + vadd.u32 q4, q1, q0 + ldrd r6, [sp, #10*4] + vadd.u32 q6, q9, q8 + add r11, r4, r11 + vshl.u32 q5, q4, #9 + eor r8, r8, r10, ror #23 + vshl.u32 q7, q6, #9 + add r10, r5, r0 + vshr.u32 q4, q4, #32-9 + eor r9, r9, r11, ror #23 + vshr.u32 q6, q6, #32-9 + add r11, r2, r1 + veor.u32 q2, q2, q5 + eor r6, r6, r10, ror #23 + veor.u32 q10, q10, q7 + eor r7, r7, r11, ror #23 + veor.u32 q2, q2, q4 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q6 + + add r2, r7, r2 + vadd.u32 q4, q2, q1 + ldrd r10, [sp, #14*4] + vadd.u32 q6, q10, q9 + add r3, r8, r3 + vext.u32 q1, q1, q1, #3 + eor r12, r12, r2, ror #19 + vshl.u32 q5, q4, #13 + add r2, r9, r4 + vext.u32 q9, q9, q9, #3 + eor lr, lr, r3, ror #19 + vshl.u32 q7, q6, #13 + add r3, r6, r5 + vshr.u32 q4, q4, 
#32-13 + eor r10, r10, r2, ror #19 + vshr.u32 q6, q6, #32-13 + eor r11, r11, r3, ror #19 + veor.u32 q3, q3, q5 + veor.u32 q11, q11, q7 + veor.u32 q3, q3, q4 + veor.u32 q11, q11, q6 + + ldrd r2, [sp, #2*4] + vadd.u32 q4, q3, q2 + add r6, r11, r6 + vadd.u32 q6, q11, q10 + add r7, r12, r7 + vswp.u32 d4, d5 + eor r0, r0, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, lr, r8 + vswp.u32 d20, d21 + eor r1, r1, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r10, r9 + vext.u32 q3, q3, q3, #1 + eor r2, r2, r6, ror #14 + vshr.u32 q4, q4, #32-18 + eor r3, r3, r7, ror #14 + vshr.u32 q6, q6, #32-18 + strd r2, [sp, #2*4] + vext.u32 q11, q11, q11, #1 + strd r10, [sp, #14*4] + veor.u32 q0, q0, q5 + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 ldrd r6, [sp, #6*4] vadd.u32 q4, q0, q1 add r6, r2, r6 @@ -769,324 +5357,190 @@ scrypt_core_loop2: veor.u32 q8, q8, q7 veor.u32 q0, q0, q4 veor.u32 q8, q8, q6 -.endm - -.macro salsa8_core_3way - ldmia sp, {r0-r12, lr} - ldrd r10, [sp, #14*4] - salsa8_core_3way_doubleround - salsa8_core_3way_doubleround - salsa8_core_3way_doubleround - salsa8_core_3way_doubleround - stmia sp, {r0-r5} - strd r8, [sp, #8*4] - str r12, [sp, #12*4] - str lr, [sp, #13*4] -.endm - - .text - .code 32 - .align 2 - .globl scrypt_core_3way - .globl _scrypt_core_3way -#ifdef __ELF__ - .type scrypt_core_3way, %function -#endif -scrypt_core_3way: -_scrypt_core_3way: - stmfd sp!, {r4-r11, lr} - vpush {q4-q7} - mov r12, sp - sub sp, sp, #24*16 - bic sp, sp, #63 - str r2, [sp, #4*16+3*4] - str r12, [sp, #4*16+4*4] - - mov r3, r0 - vldmia r3!, {q8-q15} - vmov.u64 q0, #0xffffffff - vmov.u32 q1, q8 - vmov.u32 q2, q12 - vbif.u32 q8, q9, q0 - vbif.u32 q12, q13, q0 - vbif.u32 q9, q10, q0 - vbif.u32 q13, q14, q0 - vbif.u32 q10, q11, q0 - vbif.u32 q14, q15, q0 - vbif.u32 q11, q1, q0 - vbif.u32 q15, q2, q0 - vldmia r3!, {q0-q7} - vswp.u32 d17, d21 - vswp.u32 d25, d29 - vswp.u32 d18, d22 - vswp.u32 d26, d30 - vstmia r0, {q8-q15} - vmov.u64 q8, #0xffffffff - vmov.u32 q9, q0 - vmov.u32 q10, q4 - vbif.u32 q0, q1, q8 - vbif.u32 q4, q5, q8 - vbif.u32 q1, q2, q8 - vbif.u32 q5, q6, q8 - vbif.u32 q2, q3, q8 - vbif.u32 q6, q7, q8 - vbif.u32 q3, q9, q8 - vbif.u32 q7, q10, q8 - vldmia r3, {q8-q15} - vswp.u32 d1, d5 - vswp.u32 d9, d13 - vswp.u32 d2, d6 - vswp.u32 d10, d14 - add r12, sp, #8*16 - vstmia r12!, {q0-q7} - vmov.u64 q0, #0xffffffff - vmov.u32 q1, q8 - vmov.u32 q2, q12 - vbif.u32 q8, q9, q0 - vbif.u32 q12, q13, q0 - vbif.u32 q9, q10, q0 - vbif.u32 q13, q14, q0 - vbif.u32 q10, q11, q0 - vbif.u32 q14, q15, q0 - vbif.u32 q11, q1, q0 - vbif.u32 q15, q2, q0 - vswp.u32 d17, d21 - vswp.u32 d25, d29 - vswp.u32 d18, d22 - vswp.u32 d26, d30 - vstmia r12, {q8-q15} - - add lr, sp, #128 - vldmia lr, {q0-q7} - add r2, r1, r2, lsl #7 - str r0, [sp, #4*16+0*4] - str r2, [sp, #4*16+2*4] -scrypt_core_3way_loop1: - add lr, r0, #16*4 - add r3, r1, #16*4 - str r1, [sp, #4*16+1*4] - mov r12, sp - scrypt_core_macro1a_x4 - scrypt_core_macro1a_x4 - scrypt_core_macro1a_x4 - ldr r2, [sp, #4*16+3*4] - scrypt_core_macro1a_x4 - sub r1, r1, #4*16 - - add r1, r1, r2, lsl #7 - vstmia r1, {q0-q7} - add r3, r1, r2, lsl #7 - vstmia r3, {q8-q15} - - add lr, sp, #128 - veor.u32 q0, q0, q4 - veor.u32 q1, q1, q5 - veor.u32 q2, q2, q6 - veor.u32 q3, q3, q7 - vstmia lr, {q0-q3} - veor.u32 q8, q8, q12 - veor.u32 q9, q9, q13 - veor.u32 q10, q10, q14 - veor.u32 q11, q11, q15 - add r12, sp, #256 - vstmia r12, {q8-q11} - - salsa8_core_3way - - ldr r0, [sp, #4*16+0*4] - mov r12, sp - add r2, r0, #16*4 - scrypt_core_macro2_x4 - 
scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - - add lr, sp, #128 - vldmia lr, {q4-q7} - vadd.u32 q4, q4, q0 - vadd.u32 q5, q5, q1 - vadd.u32 q6, q6, q2 - vadd.u32 q7, q7, q3 - add r12, sp, #256 - vldmia r12, {q0-q3} - vstmia lr, {q4-q7} - vadd.u32 q8, q8, q0 - vadd.u32 q9, q9, q1 - vadd.u32 q10, q10, q2 - vadd.u32 q11, q11, q3 - - add r4, sp, #128+4*16 - vldmia r4, {q0-q3} - vstmia r12, {q8-q11} - veor.u32 q0, q0, q4 - veor.u32 q1, q1, q5 - veor.u32 q2, q2, q6 - veor.u32 q3, q3, q7 - vstmia r4, {q0-q3} - veor.u32 q8, q8, q12 - veor.u32 q9, q9, q13 - veor.u32 q10, q10, q14 - veor.u32 q11, q11, q15 - vmov q12, q8 - vmov q13, q9 - vmov q14, q10 - vmov q15, q11 - - salsa8_core_3way - - ldr r0, [sp, #4*16+0*4] - mov r1, sp - add r0, r0, #16*4 - scrypt_core_macro3_x6 - scrypt_core_macro3_x6 - scrypt_core_macro3_x4 - sub r0, r0, #8*16 - - ldr r1, [sp, #4*16+1*4] - ldr r2, [sp, #4*16+2*4] - add lr, sp, #128 - add r4, sp, #128+4*16 - vldmia r4, {q4-q7} - vadd.u32 q4, q4, q0 - vadd.u32 q5, q5, q1 - vadd.u32 q6, q6, q2 - vadd.u32 q7, q7, q3 - vstmia r4, {q4-q7} - vldmia lr, {q0-q3} - vadd.u32 q12, q12, q8 - vadd.u32 q13, q13, q9 - vadd.u32 q14, q14, q10 - vadd.u32 q15, q15, q11 - add r12, sp, #256 - vldmia r12, {q8-q11} - - add r1, r1, #8*16 - cmp r1, r2 - bne scrypt_core_3way_loop1 - - ldr r2, [sp, #4*16+3*4] - add r5, sp, #256+4*16 - vstmia r5, {q12-q15} - - sub r1, r1, r2, lsl #7 - str r1, [sp, #4*16+1*4] -scrypt_core_3way_loop2: - str r2, [sp, #4*16+2*4] + ldrd r6, [sp, #6*4] + vadd.u32 q4, q0, q1 + add r6, r2, r6 + vadd.u32 q6, q8, q9 + add r7, r3, r7 + vshl.u32 q5, q4, #7 + eor r10, r10, r6, ror #25 + vshl.u32 q7, q6, #7 + add r6, r0, r4 + vshr.u32 q4, q4, #32-7 + eor r11, r11, r7, ror #25 + vshr.u32 q6, q6, #32-7 + add r7, r1, r5 + veor.u32 q3, q3, q5 + strd r10, [sp, #14*4] + veor.u32 q11, q11, q7 + eor r12, r12, r6, ror #25 + veor.u32 q3, q3, q4 + eor lr, lr, r7, ror #25 + veor.u32 q11, q11, q6 - ldr r0, [sp, #4*16+0*4] - ldr r1, [sp, #4*16+1*4] - ldr r2, [sp, #4*16+3*4] - ldr r4, [r0, #16*4] - sub r2, r2, #1 - and r4, r4, r2 - add r1, r1, r4, lsl #7 - add r2, r0, #16*4 - add r3, r1, #16*4 - mov r12, sp - scrypt_core_macro1b_x4 - scrypt_core_macro1b_x4 - scrypt_core_macro1b_x4 - scrypt_core_macro1b_x4 + ldrd r6, [sp, #10*4] + vadd.u32 q4, q3, q0 + add r2, r10, r2 + vadd.u32 q6, q11, q8 + add r3, r11, r3 + vshl.u32 q5, q4, #9 + eor r6, r6, r2, ror #23 + vshl.u32 q7, q6, #9 + add r2, r12, r0 + vshr.u32 q4, q4, #32-9 + eor r7, r7, r3, ror #23 + vshr.u32 q6, q6, #32-9 + add r3, lr, r1 + veor.u32 q2, q2, q5 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q7 + eor r8, r8, r2, ror #23 + veor.u32 q2, q2, q4 + eor r9, r9, r3, ror #23 + veor.u32 q10, q10, q6 - ldr r1, [sp, #4*16+1*4] - ldr r2, [sp, #4*16+3*4] - add r1, r1, r2, lsl #7 - add r3, r1, r2, lsl #7 - sub r2, r2, #1 - vmov r6, r7, d8 - and r6, r6, r2 - add r6, r1, r6, lsl #7 - vmov r7, r8, d24 - add lr, sp, #128 - vldmia lr, {q0-q3} - pld [r6] - pld [r6, #8*4] - pld [r6, #16*4] - pld [r6, #24*4] - vldmia r6, {q8-q15} - and r7, r7, r2 - add r7, r3, r7, lsl #7 - veor.u32 q8, q8, q0 - veor.u32 q9, q9, q1 - veor.u32 q10, q10, q2 - veor.u32 q11, q11, q3 - pld [r7] - pld [r7, #8*4] - pld [r7, #16*4] - pld [r7, #24*4] - veor.u32 q12, q12, q4 - veor.u32 q13, q13, q5 - veor.u32 q14, q14, q6 - veor.u32 q15, q15, q7 - vldmia r7, {q0-q7} - vstmia lr, {q8-q15} - add r12, sp, #256 - vldmia r12, {q8-q15} - veor.u32 q8, q8, q0 - veor.u32 q9, q9, q1 - veor.u32 q10, q10, q2 - veor.u32 q11, q11, q3 - veor.u32 q12, q12, q4 - veor.u32 q13, 
q13, q5 - veor.u32 q14, q14, q6 - veor.u32 q15, q15, q7 + ldrd r2, [sp, #6*4] + vadd.u32 q4, q2, q3 + add r10, r6, r10 + vadd.u32 q6, q10, q11 + add r11, r7, r11 + vext.u32 q3, q3, q3, #3 + eor r2, r2, r10, ror #19 + vshl.u32 q5, q4, #13 + add r10, r8, r12 + vext.u32 q11, q11, q11, #3 + eor r3, r3, r11, ror #19 + vshl.u32 q7, q6, #13 + add r11, r9, lr + vshr.u32 q4, q4, #32-13 + eor r4, r4, r10, ror #19 + vshr.u32 q6, q6, #32-13 + eor r5, r5, r11, ror #19 + veor.u32 q1, q1, q5 + veor.u32 q9, q9, q7 + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 - vldmia lr, {q0-q7} + ldrd r10, [sp, #2*4] + vadd.u32 q4, q1, q2 + add r6, r2, r6 + vadd.u32 q6, q9, q10 + add r7, r3, r7 + vswp.u32 d4, d5 + eor r10, r10, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, r4, r8 + vswp.u32 d20, d21 + eor r11, r11, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r5, r9 + vshr.u32 q4, q4, #32-18 + eor r0, r0, r6, ror #14 + vshr.u32 q6, q6, #32-18 + eor r1, r1, r7, ror #14 + veor.u32 q0, q0, q5 + ldrd r6, [sp, #14*4] + veor.u32 q8, q8, q7 veor.u32 q0, q0, q4 - veor.u32 q1, q1, q5 - veor.u32 q2, q2, q6 - veor.u32 q3, q3, q7 - vstmia lr, {q0-q3} - veor.u32 q8, q8, q12 - veor.u32 q9, q9, q13 - veor.u32 q10, q10, q14 - veor.u32 q11, q11, q15 - vstmia r12, {q8-q15} + veor.u32 q8, q8, q6 - salsa8_core_3way - ldr r0, [sp, #4*16+0*4] - mov r12, sp - add r2, r0, #16*4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 + strd r2, [sp, #6*4] + vadd.u32 q4, q0, q3 + strd r10, [sp, #2*4] + vadd.u32 q6, q8, q11 + add r6, r11, r6 + vext.u32 q1, q1, q1, #1 + add r7, r0, r7 + vshl.u32 q5, q4, #7 + eor r4, r4, r6, ror #25 + vext.u32 q9, q9, q9, #1 + add r6, r1, r12 + vshl.u32 q7, q6, #7 + eor r5, r5, r7, ror #25 + vshr.u32 q4, q4, #32-7 + add r7, r10, lr + vshr.u32 q6, q6, #32-7 + eor r2, r2, r6, ror #25 + veor.u32 q1, q1, q5 + eor r3, r3, r7, ror #25 + veor.u32 q9, q9, q7 + strd r2, [sp, #6*4] + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 - add lr, sp, #128 - vldmia lr, {q4-q7} - vadd.u32 q4, q4, q0 - vadd.u32 q5, q5, q1 - vadd.u32 q6, q6, q2 - vadd.u32 q7, q7, q3 - add r12, sp, #256 - vldmia r12, {q12-q15} - vstmia lr, {q4-q7} - vadd.u32 q12, q12, q8 - vadd.u32 q13, q13, q9 - vadd.u32 q14, q14, q10 - vadd.u32 q15, q15, q11 + add r10, r3, r10 + vadd.u32 q4, q1, q0 + ldrd r6, [sp, #10*4] + vadd.u32 q6, q9, q8 + add r11, r4, r11 + vshl.u32 q5, q4, #9 + eor r8, r8, r10, ror #23 + vshl.u32 q7, q6, #9 + add r10, r5, r0 + vshr.u32 q4, q4, #32-9 + eor r9, r9, r11, ror #23 + vshr.u32 q6, q6, #32-9 + add r11, r2, r1 + veor.u32 q2, q2, q5 + eor r6, r6, r10, ror #23 + veor.u32 q10, q10, q7 + eor r7, r7, r11, ror #23 + veor.u32 q2, q2, q4 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q6 - add r4, sp, #128+4*16 - vldmia r4, {q0-q3} - vstmia r12, {q12-q15} - veor.u32 q0, q0, q4 - veor.u32 q1, q1, q5 - veor.u32 q2, q2, q6 - veor.u32 q3, q3, q7 - add r5, sp, #256+4*16 - vldmia r5, {q8-q11} - vstmia r4, {q0-q3} - veor.u32 q8, q8, q12 - veor.u32 q9, q9, q13 - veor.u32 q10, q10, q14 - veor.u32 q11, q11, q15 - vmov q12, q8 - vmov q13, q9 - vmov q14, q10 - vmov q15, q11 + add r2, r7, r2 + vadd.u32 q4, q2, q1 + ldrd r10, [sp, #14*4] + vadd.u32 q6, q10, q9 + add r3, r8, r3 + vext.u32 q1, q1, q1, #3 + eor r12, r12, r2, ror #19 + vshl.u32 q5, q4, #13 + add r2, r9, r4 + vext.u32 q9, q9, q9, #3 + eor lr, lr, r3, ror #19 + vshl.u32 q7, q6, #13 + add r3, r6, r5 + vshr.u32 q4, q4, #32-13 + eor r10, r10, r2, ror #19 + vshr.u32 q6, q6, #32-13 + eor r11, r11, r3, ror #19 + veor.u32 q3, q3, q5 + veor.u32 q11, q11, q7 + veor.u32 q3, q3, 
q4 + veor.u32 q11, q11, q6 - salsa8_core_3way + ldrd r2, [sp, #2*4] + vadd.u32 q4, q3, q2 + add r6, r11, r6 + vadd.u32 q6, q11, q10 + add r7, r12, r7 + vswp.u32 d4, d5 + eor r0, r0, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, lr, r8 + vswp.u32 d20, d21 + eor r1, r1, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r10, r9 + vext.u32 q3, q3, q3, #1 + eor r2, r2, r6, ror #14 + vshr.u32 q4, q4, #32-18 + eor r3, r3, r7, ror #14 + vshr.u32 q6, q6, #32-18 + strd r2, [sp, #2*4] + vext.u32 q11, q11, q11, #1 + strd r10, [sp, #14*4] + veor.u32 q0, q0, q5 + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + stmia sp, {r0-r5} + strd r8, [sp, #8*4] + str r12, [sp, #12*4] + str lr, [sp, #13*4] ldr r0, [sp, #4*16+0*4] ldr r3, [sp, #4*16+1*4] @@ -1094,15 +5548,37 @@ scrypt_core_3way_loop2: mov r1, sp add r0, r0, #16*4 sub r2, r2, #1 - scrypt_core_macro3_x4 + ldmia r1!, {r4-r7} + ldmia r0, {r8-r11} + add r4, r4, r8 + add r5, r5, r9 + add r6, r6, r10 + add r7, r7, r11 + stmia r0!, {r4-r7} and r4, r4, r2 add r3, r3, r4, lsl #7 pld [r3, #16*4] pld [r3] pld [r3, #24*4] pld [r3, #8*4] - scrypt_core_macro3_x6 - scrypt_core_macro3_x6 + ldmia r1!, {r2-r7} + ldmia r0, {r8-r12, lr} + add r2, r2, r8 + add r3, r3, r9 + add r4, r4, r10 + add r5, r5, r11 + add r6, r6, r12 + add r7, r7, lr + stmia r0!, {r2-r7} + ldmia r1!, {r2-r7} + ldmia r0, {r8-r12, lr} + add r2, r2, r8 + add r3, r3, r9 + add r4, r4, r10 + add r5, r5, r11 + add r6, r6, r12 + add r7, r7, lr + stmia r0!, {r2-r7} add lr, sp, #128 add r4, sp, #128+4*16 diff --git a/scrypt-arm.S.orig b/scrypt-arm.S.orig new file mode 100644 index 000000000..5be3b0e9d --- /dev/null +++ b/scrypt-arm.S.orig @@ -0,0 +1,1186 @@ +/* + * Copyright 2012, 2014 pooler@litecoinpool.org + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. See COPYING for more details. 
+ */ + +#include "cpuminer-config.h" + +#if defined(USE_ASM) && defined(__arm__) && defined(__APCS_32__) + +#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \ + defined(__ARM_ARCH_5TEJ__) || defined(__ARM_ARCH_6__) || \ + defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || \ + defined(__ARM_ARCH_6M__) || defined(__ARM_ARCH_6T2__) || \ + defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) +#define __ARM_ARCH_5E_OR_6__ +#endif + +#if defined(__ARM_ARCH_5E_OR_6__) || defined(__ARM_ARCH_7__) || \ + defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || \ + defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7EM__) +#define __ARM_ARCH_5E_OR_6_OR_7__ +#endif + +#ifdef __ARM_ARCH_5E_OR_6__ + +.macro scrypt_shuffle + add lr, r0, #9*4 + ldmia r0, {r2-r7} + ldmia lr, {r2, r8-r12, lr} + str r3, [r0, #5*4] + str r5, [r0, #15*4] + str r6, [r0, #12*4] + str r7, [r0, #1*4] + ldr r5, [r0, #7*4] + str r2, [r0, #13*4] + str r8, [r0, #2*4] + strd r4, [r0, #10*4] + str r9, [r0, #7*4] + str r10, [r0, #4*4] + str r11, [r0, #9*4] + str lr, [r0, #3*4] + + add r2, r0, #64+0*4 + add lr, r0, #64+9*4 + ldmia r2, {r2-r7} + ldmia lr, {r2, r8-r12, lr} + str r3, [r0, #64+5*4] + str r5, [r0, #64+15*4] + str r6, [r0, #64+12*4] + str r7, [r0, #64+1*4] + ldr r5, [r0, #64+7*4] + str r2, [r0, #64+13*4] + str r8, [r0, #64+2*4] + strd r4, [r0, #64+10*4] + str r9, [r0, #64+7*4] + str r10, [r0, #64+4*4] + str r11, [r0, #64+9*4] + str lr, [r0, #64+3*4] +.endm + +.macro salsa8_core_doubleround_body + add r6, r2, r6 + add r7, r3, r7 + eor r10, r10, r6, ror #25 + add r6, r0, r4 + eor r11, r11, r7, ror #25 + add r7, r1, r5 + strd r10, [sp, #14*4] + eor r12, r12, r6, ror #25 + eor lr, lr, r7, ror #25 + + ldrd r6, [sp, #10*4] + add r2, r10, r2 + add r3, r11, r3 + eor r6, r6, r2, ror #23 + add r2, r12, r0 + eor r7, r7, r3, ror #23 + add r3, lr, r1 + strd r6, [sp, #10*4] + eor r8, r8, r2, ror #23 + eor r9, r9, r3, ror #23 + + ldrd r2, [sp, #6*4] + add r10, r6, r10 + add r11, r7, r11 + eor r2, r2, r10, ror #19 + add r10, r8, r12 + eor r3, r3, r11, ror #19 + add r11, r9, lr + eor r4, r4, r10, ror #19 + eor r5, r5, r11, ror #19 + + ldrd r10, [sp, #2*4] + add r6, r2, r6 + add r7, r3, r7 + eor r10, r10, r6, ror #14 + add r6, r4, r8 + eor r11, r11, r7, ror #14 + add r7, r5, r9 + eor r0, r0, r6, ror #14 + eor r1, r1, r7, ror #14 + + + ldrd r6, [sp, #14*4] + strd r2, [sp, #6*4] + strd r10, [sp, #2*4] + add r6, r11, r6 + add r7, r0, r7 + eor r4, r4, r6, ror #25 + add r6, r1, r12 + eor r5, r5, r7, ror #25 + add r7, r10, lr + eor r2, r2, r6, ror #25 + eor r3, r3, r7, ror #25 + strd r2, [sp, #6*4] + + add r10, r3, r10 + ldrd r6, [sp, #10*4] + add r11, r4, r11 + eor r8, r8, r10, ror #23 + add r10, r5, r0 + eor r9, r9, r11, ror #23 + add r11, r2, r1 + eor r6, r6, r10, ror #23 + eor r7, r7, r11, ror #23 + strd r6, [sp, #10*4] + + add r2, r7, r2 + ldrd r10, [sp, #14*4] + add r3, r8, r3 + eor r12, r12, r2, ror #19 + add r2, r9, r4 + eor lr, lr, r3, ror #19 + add r3, r6, r5 + eor r10, r10, r2, ror #19 + eor r11, r11, r3, ror #19 + + ldrd r2, [sp, #2*4] + add r6, r11, r6 + add r7, r12, r7 + eor r0, r0, r6, ror #14 + add r6, lr, r8 + eor r1, r1, r7, ror #14 + add r7, r10, r9 + eor r2, r2, r6, ror #14 + eor r3, r3, r7, ror #14 +.endm + +.macro salsa8_core + ldmia sp, {r0-r12, lr} + + ldrd r10, [sp, #14*4] + salsa8_core_doubleround_body + ldrd r6, [sp, #6*4] + strd r2, [sp, #2*4] + strd r10, [sp, #14*4] + salsa8_core_doubleround_body + ldrd r6, [sp, #6*4] + strd r2, [sp, #2*4] + strd r10, [sp, #14*4] + salsa8_core_doubleround_body + ldrd r6, [sp, 
#6*4] + strd r2, [sp, #2*4] + strd r10, [sp, #14*4] + salsa8_core_doubleround_body + + stmia sp, {r0-r5} + strd r8, [sp, #8*4] + str r12, [sp, #12*4] + str lr, [sp, #13*4] + strd r10, [sp, #14*4] +.endm + +#else + +.macro scrypt_shuffle +.endm + +.macro salsa8_core_doubleround_body + ldr r8, [sp, #8*4] + add r11, r11, r10 + ldr lr, [sp, #13*4] + add r12, r12, r3 + eor r2, r2, r11, ror #23 + add r11, r4, r0 + eor r7, r7, r12, ror #23 + add r12, r9, r5 + str r9, [sp, #9*4] + eor r8, r8, r11, ror #23 + str r10, [sp, #14*4] + eor lr, lr, r12, ror #23 + + ldr r11, [sp, #11*4] + add r9, lr, r9 + ldr r12, [sp, #12*4] + add r10, r2, r10 + eor r1, r1, r9, ror #19 + add r9, r7, r3 + eor r6, r6, r10, ror #19 + add r10, r8, r4 + str r8, [sp, #8*4] + eor r11, r11, r9, ror #19 + str lr, [sp, #13*4] + eor r12, r12, r10, ror #19 + + ldr r9, [sp, #10*4] + add r8, r12, r8 + ldr r10, [sp, #15*4] + add lr, r1, lr + eor r0, r0, r8, ror #14 + add r8, r6, r2 + eor r5, r5, lr, ror #14 + add lr, r11, r7 + eor r9, r9, r8, ror #14 + ldr r8, [sp, #9*4] + eor r10, r10, lr, ror #14 + ldr lr, [sp, #14*4] + + + add r8, r9, r8 + str r9, [sp, #10*4] + add lr, r10, lr + str r10, [sp, #15*4] + eor r11, r11, r8, ror #25 + add r8, r0, r3 + eor r12, r12, lr, ror #25 + add lr, r5, r4 + eor r1, r1, r8, ror #25 + ldr r8, [sp, #8*4] + eor r6, r6, lr, ror #25 + + add r9, r11, r9 + ldr lr, [sp, #13*4] + add r10, r12, r10 + eor r8, r8, r9, ror #23 + add r9, r1, r0 + eor lr, lr, r10, ror #23 + add r10, r6, r5 + str r11, [sp, #11*4] + eor r2, r2, r9, ror #23 + str r12, [sp, #12*4] + eor r7, r7, r10, ror #23 + + ldr r9, [sp, #9*4] + add r11, r8, r11 + ldr r10, [sp, #14*4] + add r12, lr, r12 + eor r9, r9, r11, ror #19 + add r11, r2, r1 + eor r10, r10, r12, ror #19 + add r12, r7, r6 + str r8, [sp, #8*4] + eor r3, r3, r11, ror #19 + str lr, [sp, #13*4] + eor r4, r4, r12, ror #19 +.endm + +.macro salsa8_core + ldmia sp, {r0-r7} + + ldr r12, [sp, #15*4] + ldr r8, [sp, #11*4] + ldr lr, [sp, #12*4] + + ldr r9, [sp, #9*4] + add r8, r8, r12 + ldr r11, [sp, #10*4] + add lr, lr, r0 + eor r3, r3, r8, ror #25 + add r8, r5, r1 + ldr r10, [sp, #14*4] + eor r4, r4, lr, ror #25 + add lr, r11, r6 + eor r9, r9, r8, ror #25 + eor r10, r10, lr, ror #25 + + salsa8_core_doubleround_body + + ldr r11, [sp, #10*4] + add r8, r9, r8 + ldr r12, [sp, #15*4] + add lr, r10, lr + eor r11, r11, r8, ror #14 + add r8, r3, r2 + eor r12, r12, lr, ror #14 + add lr, r4, r7 + eor r0, r0, r8, ror #14 + ldr r8, [sp, #11*4] + eor r5, r5, lr, ror #14 + ldr lr, [sp, #12*4] + + add r8, r8, r12 + str r11, [sp, #10*4] + add lr, lr, r0 + str r12, [sp, #15*4] + eor r3, r3, r8, ror #25 + add r8, r5, r1 + eor r4, r4, lr, ror #25 + add lr, r11, r6 + str r9, [sp, #9*4] + eor r9, r9, r8, ror #25 + str r10, [sp, #14*4] + eor r10, r10, lr, ror #25 + + salsa8_core_doubleround_body + + ldr r11, [sp, #10*4] + add r8, r9, r8 + ldr r12, [sp, #15*4] + add lr, r10, lr + eor r11, r11, r8, ror #14 + add r8, r3, r2 + eor r12, r12, lr, ror #14 + add lr, r4, r7 + eor r0, r0, r8, ror #14 + ldr r8, [sp, #11*4] + eor r5, r5, lr, ror #14 + ldr lr, [sp, #12*4] + + add r8, r8, r12 + str r11, [sp, #10*4] + add lr, lr, r0 + str r12, [sp, #15*4] + eor r3, r3, r8, ror #25 + add r8, r5, r1 + eor r4, r4, lr, ror #25 + add lr, r11, r6 + str r9, [sp, #9*4] + eor r9, r9, r8, ror #25 + str r10, [sp, #14*4] + eor r10, r10, lr, ror #25 + + salsa8_core_doubleround_body + + ldr r11, [sp, #10*4] + add r8, r9, r8 + ldr r12, [sp, #15*4] + add lr, r10, lr + eor r11, r11, r8, ror #14 + add r8, r3, r2 + eor r12, r12, lr, ror #14 + 
add lr, r4, r7 + eor r0, r0, r8, ror #14 + ldr r8, [sp, #11*4] + eor r5, r5, lr, ror #14 + ldr lr, [sp, #12*4] + + add r8, r8, r12 + str r11, [sp, #10*4] + add lr, lr, r0 + str r12, [sp, #15*4] + eor r3, r3, r8, ror #25 + add r8, r5, r1 + eor r4, r4, lr, ror #25 + add lr, r11, r6 + str r9, [sp, #9*4] + eor r9, r9, r8, ror #25 + str r10, [sp, #14*4] + eor r10, r10, lr, ror #25 + + salsa8_core_doubleround_body + + ldr r11, [sp, #10*4] + add r8, r9, r8 + ldr r12, [sp, #15*4] + add lr, r10, lr + str r9, [sp, #9*4] + eor r11, r11, r8, ror #14 + eor r12, r12, lr, ror #14 + add r8, r3, r2 + str r10, [sp, #14*4] + add lr, r4, r7 + str r11, [sp, #10*4] + eor r0, r0, r8, ror #14 + str r12, [sp, #15*4] + eor r5, r5, lr, ror #14 + + stmia sp, {r0-r7} +.endm + +#endif + + +.macro scrypt_core_macro1a_x4 + ldmia r0, {r4-r7} + ldmia lr!, {r8-r11} + stmia r1!, {r4-r7} + stmia r3!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r0!, {r4-r7} + stmia r12!, {r4-r7} +.endm + +.macro scrypt_core_macro1b_x4 + ldmia r3!, {r8-r11} + ldmia r2, {r4-r7} + eor r8, r8, r4 + eor r9, r9, r5 + eor r10, r10, r6 + eor r11, r11, r7 + ldmia r0, {r4-r7} + stmia r2!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + ldmia r1!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r0!, {r4-r7} + stmia r12!, {r4-r7} +.endm + +.macro scrypt_core_macro2_x4 + ldmia r12, {r4-r7} + ldmia r0, {r8-r11} + add r4, r4, r8 + add r5, r5, r9 + add r6, r6, r10 + add r7, r7, r11 + stmia r0!, {r4-r7} + ldmia r2, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r2!, {r4-r7} + stmia r12!, {r4-r7} +.endm + +.macro scrypt_core_macro3_x4 + ldmia r1!, {r4-r7} + ldmia r0, {r8-r11} + add r4, r4, r8 + add r5, r5, r9 + add r6, r6, r10 + add r7, r7, r11 + stmia r0!, {r4-r7} +.endm + +.macro scrypt_core_macro3_x6 + ldmia r1!, {r2-r7} + ldmia r0, {r8-r12, lr} + add r2, r2, r8 + add r3, r3, r9 + add r4, r4, r10 + add r5, r5, r11 + add r6, r6, r12 + add r7, r7, lr + stmia r0!, {r2-r7} +.endm + + + .text + .code 32 + .align 2 + .globl scrypt_core + .globl _scrypt_core +#ifdef __ELF__ + .type scrypt_core, %function +#endif +scrypt_core: +_scrypt_core: + stmfd sp!, {r4-r11, lr} + mov r12, sp + sub sp, sp, #22*4 + bic sp, sp, #63 + str r12, [sp, #20*4] + str r2, [sp, #21*4] + + scrypt_shuffle + + ldr r2, [sp, #21*4] + str r0, [sp, #16*4] + add r12, r1, r2, lsl #7 + str r12, [sp, #18*4] +scrypt_core_loop1: + add lr, r0, #16*4 + add r3, r1, #16*4 + mov r12, sp + scrypt_core_macro1a_x4 + scrypt_core_macro1a_x4 + scrypt_core_macro1a_x4 + scrypt_core_macro1a_x4 + str r1, [sp, #17*4] + + salsa8_core + + ldr r0, [sp, #16*4] + mov r12, sp + add r2, r0, #16*4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + + salsa8_core + + ldr r0, [sp, #16*4] + mov r1, sp + add r0, r0, #16*4 + scrypt_core_macro3_x6 + scrypt_core_macro3_x6 + ldr r3, [sp, #17*4] + ldr r12, [sp, #18*4] + scrypt_core_macro3_x4 + + add r1, r3, #16*4 + sub r0, r0, #32*4 + cmp r1, r12 + bne scrypt_core_loop1 + + ldr r12, [sp, #21*4] + ldr r4, [r0, #16*4] + sub r2, r12, #1 + str r2, [sp, #21*4] + sub r1, r1, r12, lsl #7 + str r1, [sp, #17*4] + and r4, r4, r2 + add r1, r1, r4, lsl #7 +scrypt_core_loop2: + add r2, r0, #16*4 + add r3, r1, #16*4 + str r12, [sp, #18*4] + mov r12, sp +#ifdef __ARM_ARCH_5E_OR_6_OR_7__ + pld [r1, #24*4] + pld [r1, #8*4] +#endif + scrypt_core_macro1b_x4 + scrypt_core_macro1b_x4 + 
scrypt_core_macro1b_x4 + scrypt_core_macro1b_x4 + + salsa8_core + + ldr r0, [sp, #16*4] + mov r12, sp + add r2, r0, #16*4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + + salsa8_core + + ldr r0, [sp, #16*4] + mov r1, sp + ldr r3, [sp, #17*4] + add r0, r0, #16*4 + ldr r2, [sp, #21*4] + scrypt_core_macro3_x4 + and r4, r4, r2 + add r3, r3, r4, lsl #7 + str r3, [sp, #19*4] +#ifdef __ARM_ARCH_5E_OR_6_OR_7__ + pld [r3, #16*4] + pld [r3] +#endif + scrypt_core_macro3_x6 + scrypt_core_macro3_x6 + + ldr r12, [sp, #18*4] + sub r0, r0, #32*4 + ldr r1, [sp, #19*4] + subs r12, r12, #1 + bne scrypt_core_loop2 + + scrypt_shuffle + + ldr sp, [sp, #20*4] +#ifdef __thumb__ + ldmfd sp!, {r4-r11, lr} + bx lr +#else + ldmfd sp!, {r4-r11, pc} +#endif + + +#ifdef __ARM_NEON__ + +.macro salsa8_core_3way_doubleround + ldrd r6, [sp, #6*4] + vadd.u32 q4, q0, q1 + add r6, r2, r6 + vadd.u32 q6, q8, q9 + add r7, r3, r7 + vshl.u32 q5, q4, #7 + eor r10, r10, r6, ror #25 + vshl.u32 q7, q6, #7 + add r6, r0, r4 + vshr.u32 q4, q4, #32-7 + eor r11, r11, r7, ror #25 + vshr.u32 q6, q6, #32-7 + add r7, r1, r5 + veor.u32 q3, q3, q5 + strd r10, [sp, #14*4] + veor.u32 q11, q11, q7 + eor r12, r12, r6, ror #25 + veor.u32 q3, q3, q4 + eor lr, lr, r7, ror #25 + veor.u32 q11, q11, q6 + + ldrd r6, [sp, #10*4] + vadd.u32 q4, q3, q0 + add r2, r10, r2 + vadd.u32 q6, q11, q8 + add r3, r11, r3 + vshl.u32 q5, q4, #9 + eor r6, r6, r2, ror #23 + vshl.u32 q7, q6, #9 + add r2, r12, r0 + vshr.u32 q4, q4, #32-9 + eor r7, r7, r3, ror #23 + vshr.u32 q6, q6, #32-9 + add r3, lr, r1 + veor.u32 q2, q2, q5 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q7 + eor r8, r8, r2, ror #23 + veor.u32 q2, q2, q4 + eor r9, r9, r3, ror #23 + veor.u32 q10, q10, q6 + + ldrd r2, [sp, #6*4] + vadd.u32 q4, q2, q3 + add r10, r6, r10 + vadd.u32 q6, q10, q11 + add r11, r7, r11 + vext.u32 q3, q3, q3, #3 + eor r2, r2, r10, ror #19 + vshl.u32 q5, q4, #13 + add r10, r8, r12 + vext.u32 q11, q11, q11, #3 + eor r3, r3, r11, ror #19 + vshl.u32 q7, q6, #13 + add r11, r9, lr + vshr.u32 q4, q4, #32-13 + eor r4, r4, r10, ror #19 + vshr.u32 q6, q6, #32-13 + eor r5, r5, r11, ror #19 + veor.u32 q1, q1, q5 + veor.u32 q9, q9, q7 + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + ldrd r10, [sp, #2*4] + vadd.u32 q4, q1, q2 + add r6, r2, r6 + vadd.u32 q6, q9, q10 + add r7, r3, r7 + vswp.u32 d4, d5 + eor r10, r10, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, r4, r8 + vswp.u32 d20, d21 + eor r11, r11, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r5, r9 + vshr.u32 q4, q4, #32-18 + eor r0, r0, r6, ror #14 + vshr.u32 q6, q6, #32-18 + eor r1, r1, r7, ror #14 + veor.u32 q0, q0, q5 + ldrd r6, [sp, #14*4] + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + + + strd r2, [sp, #6*4] + vadd.u32 q4, q0, q3 + strd r10, [sp, #2*4] + vadd.u32 q6, q8, q11 + add r6, r11, r6 + vext.u32 q1, q1, q1, #1 + add r7, r0, r7 + vshl.u32 q5, q4, #7 + eor r4, r4, r6, ror #25 + vext.u32 q9, q9, q9, #1 + add r6, r1, r12 + vshl.u32 q7, q6, #7 + eor r5, r5, r7, ror #25 + vshr.u32 q4, q4, #32-7 + add r7, r10, lr + vshr.u32 q6, q6, #32-7 + eor r2, r2, r6, ror #25 + veor.u32 q1, q1, q5 + eor r3, r3, r7, ror #25 + veor.u32 q9, q9, q7 + strd r2, [sp, #6*4] + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + add r10, r3, r10 + vadd.u32 q4, q1, q0 + ldrd r6, [sp, #10*4] + vadd.u32 q6, q9, q8 + add r11, r4, r11 + vshl.u32 q5, q4, #9 + eor r8, r8, r10, ror #23 + vshl.u32 q7, q6, #9 + add r10, r5, r0 + vshr.u32 q4, q4, #32-9 + eor r9, r9, r11, ror #23 + vshr.u32 q6, q6, #32-9 + add 
r11, r2, r1 + veor.u32 q2, q2, q5 + eor r6, r6, r10, ror #23 + veor.u32 q10, q10, q7 + eor r7, r7, r11, ror #23 + veor.u32 q2, q2, q4 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q6 + + add r2, r7, r2 + vadd.u32 q4, q2, q1 + ldrd r10, [sp, #14*4] + vadd.u32 q6, q10, q9 + add r3, r8, r3 + vext.u32 q1, q1, q1, #3 + eor r12, r12, r2, ror #19 + vshl.u32 q5, q4, #13 + add r2, r9, r4 + vext.u32 q9, q9, q9, #3 + eor lr, lr, r3, ror #19 + vshl.u32 q7, q6, #13 + add r3, r6, r5 + vshr.u32 q4, q4, #32-13 + eor r10, r10, r2, ror #19 + vshr.u32 q6, q6, #32-13 + eor r11, r11, r3, ror #19 + veor.u32 q3, q3, q5 + veor.u32 q11, q11, q7 + veor.u32 q3, q3, q4 + veor.u32 q11, q11, q6 + + ldrd r2, [sp, #2*4] + vadd.u32 q4, q3, q2 + add r6, r11, r6 + vadd.u32 q6, q11, q10 + add r7, r12, r7 + vswp.u32 d4, d5 + eor r0, r0, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, lr, r8 + vswp.u32 d20, d21 + eor r1, r1, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r10, r9 + vext.u32 q3, q3, q3, #1 + eor r2, r2, r6, ror #14 + vshr.u32 q4, q4, #32-18 + eor r3, r3, r7, ror #14 + vshr.u32 q6, q6, #32-18 + strd r2, [sp, #2*4] + vext.u32 q11, q11, q11, #1 + strd r10, [sp, #14*4] + veor.u32 q0, q0, q5 + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 +.endm + +.macro salsa8_core_3way + ldmia sp, {r0-r12, lr} + ldrd r10, [sp, #14*4] + salsa8_core_3way_doubleround + salsa8_core_3way_doubleround + salsa8_core_3way_doubleround + salsa8_core_3way_doubleround + stmia sp, {r0-r5} + strd r8, [sp, #8*4] + str r12, [sp, #12*4] + str lr, [sp, #13*4] +.endm + + .text + .code 32 + .align 2 + .globl scrypt_core_3way + .globl _scrypt_core_3way +#ifdef __ELF__ + .type scrypt_core_3way, %function +#endif +scrypt_core_3way: +_scrypt_core_3way: + stmfd sp!, {r4-r11, lr} + vpush {q4-q7} + mov r12, sp + sub sp, sp, #24*16 + bic sp, sp, #63 + str r2, [sp, #4*16+3*4] + str r12, [sp, #4*16+4*4] + + mov r3, r0 + vldmia r3!, {q8-q15} + vmov.u64 q0, #0xffffffff + vmov.u32 q1, q8 + vmov.u32 q2, q12 + vbif.u32 q8, q9, q0 + vbif.u32 q12, q13, q0 + vbif.u32 q9, q10, q0 + vbif.u32 q13, q14, q0 + vbif.u32 q10, q11, q0 + vbif.u32 q14, q15, q0 + vbif.u32 q11, q1, q0 + vbif.u32 q15, q2, q0 + vldmia r3!, {q0-q7} + vswp.u32 d17, d21 + vswp.u32 d25, d29 + vswp.u32 d18, d22 + vswp.u32 d26, d30 + vstmia r0, {q8-q15} + vmov.u64 q8, #0xffffffff + vmov.u32 q9, q0 + vmov.u32 q10, q4 + vbif.u32 q0, q1, q8 + vbif.u32 q4, q5, q8 + vbif.u32 q1, q2, q8 + vbif.u32 q5, q6, q8 + vbif.u32 q2, q3, q8 + vbif.u32 q6, q7, q8 + vbif.u32 q3, q9, q8 + vbif.u32 q7, q10, q8 + vldmia r3, {q8-q15} + vswp.u32 d1, d5 + vswp.u32 d9, d13 + vswp.u32 d2, d6 + vswp.u32 d10, d14 + add r12, sp, #8*16 + vstmia r12!, {q0-q7} + vmov.u64 q0, #0xffffffff + vmov.u32 q1, q8 + vmov.u32 q2, q12 + vbif.u32 q8, q9, q0 + vbif.u32 q12, q13, q0 + vbif.u32 q9, q10, q0 + vbif.u32 q13, q14, q0 + vbif.u32 q10, q11, q0 + vbif.u32 q14, q15, q0 + vbif.u32 q11, q1, q0 + vbif.u32 q15, q2, q0 + vswp.u32 d17, d21 + vswp.u32 d25, d29 + vswp.u32 d18, d22 + vswp.u32 d26, d30 + vstmia r12, {q8-q15} + + add lr, sp, #128 + vldmia lr, {q0-q7} + add r2, r1, r2, lsl #7 + str r0, [sp, #4*16+0*4] + str r2, [sp, #4*16+2*4] +scrypt_core_3way_loop1: + add lr, r0, #16*4 + add r3, r1, #16*4 + str r1, [sp, #4*16+1*4] + mov r12, sp + scrypt_core_macro1a_x4 + scrypt_core_macro1a_x4 + scrypt_core_macro1a_x4 + ldr r2, [sp, #4*16+3*4] + scrypt_core_macro1a_x4 + sub r1, r1, #4*16 + + add r1, r1, r2, lsl #7 + vstmia r1, {q0-q7} + add r3, r1, r2, lsl #7 + vstmia r3, {q8-q15} + + add lr, sp, #128 + veor.u32 q0, q0, q4 + veor.u32 q1, q1, q5 
+ veor.u32 q2, q2, q6 + veor.u32 q3, q3, q7 + vstmia lr, {q0-q3} + veor.u32 q8, q8, q12 + veor.u32 q9, q9, q13 + veor.u32 q10, q10, q14 + veor.u32 q11, q11, q15 + add r12, sp, #256 + vstmia r12, {q8-q11} + + salsa8_core_3way + + ldr r0, [sp, #4*16+0*4] + mov r12, sp + add r2, r0, #16*4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + + add lr, sp, #128 + vldmia lr, {q4-q7} + vadd.u32 q4, q4, q0 + vadd.u32 q5, q5, q1 + vadd.u32 q6, q6, q2 + vadd.u32 q7, q7, q3 + add r12, sp, #256 + vldmia r12, {q0-q3} + vstmia lr, {q4-q7} + vadd.u32 q8, q8, q0 + vadd.u32 q9, q9, q1 + vadd.u32 q10, q10, q2 + vadd.u32 q11, q11, q3 + + add r4, sp, #128+4*16 + vldmia r4, {q0-q3} + vstmia r12, {q8-q11} + veor.u32 q0, q0, q4 + veor.u32 q1, q1, q5 + veor.u32 q2, q2, q6 + veor.u32 q3, q3, q7 + vstmia r4, {q0-q3} + veor.u32 q8, q8, q12 + veor.u32 q9, q9, q13 + veor.u32 q10, q10, q14 + veor.u32 q11, q11, q15 + vmov q12, q8 + vmov q13, q9 + vmov q14, q10 + vmov q15, q11 + + salsa8_core_3way + + ldr r0, [sp, #4*16+0*4] + mov r1, sp + add r0, r0, #16*4 + scrypt_core_macro3_x6 + scrypt_core_macro3_x6 + scrypt_core_macro3_x4 + sub r0, r0, #8*16 + + ldr r1, [sp, #4*16+1*4] + ldr r2, [sp, #4*16+2*4] + add lr, sp, #128 + add r4, sp, #128+4*16 + vldmia r4, {q4-q7} + vadd.u32 q4, q4, q0 + vadd.u32 q5, q5, q1 + vadd.u32 q6, q6, q2 + vadd.u32 q7, q7, q3 + vstmia r4, {q4-q7} + vldmia lr, {q0-q3} + vadd.u32 q12, q12, q8 + vadd.u32 q13, q13, q9 + vadd.u32 q14, q14, q10 + vadd.u32 q15, q15, q11 + add r12, sp, #256 + vldmia r12, {q8-q11} + + add r1, r1, #8*16 + cmp r1, r2 + bne scrypt_core_3way_loop1 + + ldr r2, [sp, #4*16+3*4] + add r5, sp, #256+4*16 + vstmia r5, {q12-q15} + + sub r1, r1, r2, lsl #7 + str r1, [sp, #4*16+1*4] +scrypt_core_3way_loop2: + str r2, [sp, #4*16+2*4] + + ldr r0, [sp, #4*16+0*4] + ldr r1, [sp, #4*16+1*4] + ldr r2, [sp, #4*16+3*4] + ldr r4, [r0, #16*4] + sub r2, r2, #1 + and r4, r4, r2 + add r1, r1, r4, lsl #7 + add r2, r0, #16*4 + add r3, r1, #16*4 + mov r12, sp + scrypt_core_macro1b_x4 + scrypt_core_macro1b_x4 + scrypt_core_macro1b_x4 + scrypt_core_macro1b_x4 + + ldr r1, [sp, #4*16+1*4] + ldr r2, [sp, #4*16+3*4] + add r1, r1, r2, lsl #7 + add r3, r1, r2, lsl #7 + sub r2, r2, #1 + vmov r6, r7, d8 + and r6, r6, r2 + add r6, r1, r6, lsl #7 + vmov r7, r8, d24 + add lr, sp, #128 + vldmia lr, {q0-q3} + pld [r6] + pld [r6, #8*4] + pld [r6, #16*4] + pld [r6, #24*4] + vldmia r6, {q8-q15} + and r7, r7, r2 + add r7, r3, r7, lsl #7 + veor.u32 q8, q8, q0 + veor.u32 q9, q9, q1 + veor.u32 q10, q10, q2 + veor.u32 q11, q11, q3 + pld [r7] + pld [r7, #8*4] + pld [r7, #16*4] + pld [r7, #24*4] + veor.u32 q12, q12, q4 + veor.u32 q13, q13, q5 + veor.u32 q14, q14, q6 + veor.u32 q15, q15, q7 + vldmia r7, {q0-q7} + vstmia lr, {q8-q15} + add r12, sp, #256 + vldmia r12, {q8-q15} + veor.u32 q8, q8, q0 + veor.u32 q9, q9, q1 + veor.u32 q10, q10, q2 + veor.u32 q11, q11, q3 + veor.u32 q12, q12, q4 + veor.u32 q13, q13, q5 + veor.u32 q14, q14, q6 + veor.u32 q15, q15, q7 + + vldmia lr, {q0-q7} + veor.u32 q0, q0, q4 + veor.u32 q1, q1, q5 + veor.u32 q2, q2, q6 + veor.u32 q3, q3, q7 + vstmia lr, {q0-q3} + veor.u32 q8, q8, q12 + veor.u32 q9, q9, q13 + veor.u32 q10, q10, q14 + veor.u32 q11, q11, q15 + vstmia r12, {q8-q15} + + salsa8_core_3way + + ldr r0, [sp, #4*16+0*4] + mov r12, sp + add r2, r0, #16*4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + + add lr, sp, #128 + vldmia lr, {q4-q7} + vadd.u32 q4, q4, q0 + vadd.u32 q5, q5, q1 + vadd.u32 q6, q6, 
q2 + vadd.u32 q7, q7, q3 + add r12, sp, #256 + vldmia r12, {q12-q15} + vstmia lr, {q4-q7} + vadd.u32 q12, q12, q8 + vadd.u32 q13, q13, q9 + vadd.u32 q14, q14, q10 + vadd.u32 q15, q15, q11 + + add r4, sp, #128+4*16 + vldmia r4, {q0-q3} + vstmia r12, {q12-q15} + veor.u32 q0, q0, q4 + veor.u32 q1, q1, q5 + veor.u32 q2, q2, q6 + veor.u32 q3, q3, q7 + add r5, sp, #256+4*16 + vldmia r5, {q8-q11} + vstmia r4, {q0-q3} + veor.u32 q8, q8, q12 + veor.u32 q9, q9, q13 + veor.u32 q10, q10, q14 + veor.u32 q11, q11, q15 + vmov q12, q8 + vmov q13, q9 + vmov q14, q10 + vmov q15, q11 + + salsa8_core_3way + + ldr r0, [sp, #4*16+0*4] + ldr r3, [sp, #4*16+1*4] + ldr r2, [sp, #4*16+3*4] + mov r1, sp + add r0, r0, #16*4 + sub r2, r2, #1 + scrypt_core_macro3_x4 + and r4, r4, r2 + add r3, r3, r4, lsl #7 + pld [r3, #16*4] + pld [r3] + pld [r3, #24*4] + pld [r3, #8*4] + scrypt_core_macro3_x6 + scrypt_core_macro3_x6 + + add lr, sp, #128 + add r4, sp, #128+4*16 + vldmia r4, {q4-q7} + vadd.u32 q4, q4, q0 + vadd.u32 q5, q5, q1 + vadd.u32 q6, q6, q2 + vadd.u32 q7, q7, q3 + vstmia r4, {q4-q7} + vadd.u32 q12, q12, q8 + vadd.u32 q13, q13, q9 + vadd.u32 q14, q14, q10 + vadd.u32 q15, q15, q11 + add r5, sp, #256+4*16 + vstmia r5, {q12-q15} + + ldr r2, [sp, #4*16+2*4] + subs r2, r2, #1 + bne scrypt_core_3way_loop2 + + ldr r0, [sp, #4*16+0*4] + vldmia r0, {q8-q15} + vmov.u64 q0, #0xffffffff + vmov.u32 q1, q8 + vmov.u32 q2, q12 + vbif.u32 q8, q9, q0 + vbif.u32 q12, q13, q0 + vbif.u32 q9, q10, q0 + vbif.u32 q13, q14, q0 + vbif.u32 q10, q11, q0 + vbif.u32 q14, q15, q0 + vbif.u32 q11, q1, q0 + vbif.u32 q15, q2, q0 + add r12, sp, #8*16 + vldmia r12!, {q0-q7} + vswp.u32 d17, d21 + vswp.u32 d25, d29 + vswp.u32 d18, d22 + vswp.u32 d26, d30 + vstmia r0!, {q8-q15} + vmov.u64 q8, #0xffffffff + vmov.u32 q9, q0 + vmov.u32 q10, q4 + vbif.u32 q0, q1, q8 + vbif.u32 q4, q5, q8 + vbif.u32 q1, q2, q8 + vbif.u32 q5, q6, q8 + vbif.u32 q2, q3, q8 + vbif.u32 q6, q7, q8 + vbif.u32 q3, q9, q8 + vbif.u32 q7, q10, q8 + vldmia r12, {q8-q15} + vswp.u32 d1, d5 + vswp.u32 d9, d13 + vswp.u32 d2, d6 + vswp.u32 d10, d14 + vstmia r0!, {q0-q7} + vmov.u64 q0, #0xffffffff + vmov.u32 q1, q8 + vmov.u32 q2, q12 + vbif.u32 q8, q9, q0 + vbif.u32 q12, q13, q0 + vbif.u32 q9, q10, q0 + vbif.u32 q13, q14, q0 + vbif.u32 q10, q11, q0 + vbif.u32 q14, q15, q0 + vbif.u32 q11, q1, q0 + vbif.u32 q15, q2, q0 + vswp.u32 d17, d21 + vswp.u32 d25, d29 + vswp.u32 d18, d22 + vswp.u32 d26, d30 + vstmia r0, {q8-q15} + + ldr sp, [sp, #4*16+4*4] + vpop {q4-q7} + ldmfd sp!, {r4-r11, pc} + +#endif /* __ARM_NEON__ */ + +#endif diff --git a/scrypt-ppc.S b/scrypt-ppc.S index 47ef643ec..78834d2fb 100644 --- a/scrypt-ppc.S +++ b/scrypt-ppc.S @@ -98,52 +98,7 @@ .machine ppc7400 #endif -.macro salsa8_core_doubleround - vadduwm v4, v0, v1 - vrlw v4, v4, v16 - vxor v3, v3, v4 - - vadduwm v4, v3, v0 - vrlw v4, v4, v17 - vxor v2, v2, v4 - - vadduwm v4, v2, v3 - vrlw v4, v4, v18 - vsldoi v3, v3, v3, 12 - vxor v1, v1, v4 - - vadduwm v4, v1, v2 - vrlw v4, v4, v19 - vsldoi v1, v1, v1, 4 - vxor v0, v0, v4 - - vadduwm v4, v0, v3 - vrlw v4, v4, v16 - vsldoi v2, v2, v2, 8 - vxor v1, v1, v4 - - vadduwm v4, v1, v0 - vrlw v4, v4, v17 - vxor v2, v2, v4 - - vadduwm v4, v2, v1 - vrlw v4, v4, v18 - vsldoi v1, v1, v1, 12 - vxor v3, v3, v4 - - vadduwm v4, v3, v2 - vrlw v4, v4, v19 - vsldoi v3, v3, v3, 4 - vxor v0, v0, v4 - vsldoi v2, v2, v2, 8 -.endm -.macro salsa8_core - salsa8_core_doubleround - salsa8_core_doubleround - salsa8_core_doubleround - salsa8_core_doubleround -.endm #ifdef _AIX .csect .text[PR] @@ -239,77 
+194,160 @@ scrypt_core_loop1: vor v3, v11, v11 stvx v15, r4, r12 - salsa8_core + vadduwm v4, v0, v1 + vrlw v4, v4, v16 + vxor v3, v3, v4 - vadduwm v8, v8, v0 - vadduwm v9, v9, v1 - vadduwm v10, v10, v2 - vadduwm v11, v11, v3 + vadduwm v4, v3, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 - vxor v12, v12, v8 - vxor v13, v13, v9 - vxor v14, v14, v10 - vxor v15, v15, v11 - vor v0, v12, v12 - vor v1, v13, v13 - vor v2, v14, v14 - vor v3, v15, v15 + vadduwm v4, v2, v3 + vrlw v4, v4, v18 + vsldoi v3, v3, v3, 12 + vxor v1, v1, v4 - salsa8_core + vadduwm v4, v1, v2 + vrlw v4, v4, v19 + vsldoi v1, v1, v1, 4 + vxor v0, v0, v4 - vadduwm v12, v12, v0 - vadduwm v13, v13, v1 - vadduwm v14, v14, v2 - vadduwm v15, v15, v3 + vadduwm v4, v0, v3 + vrlw v4, v4, v16 + vsldoi v2, v2, v2, 8 + vxor v1, v1, v4 - addi r4, r4, 32*4 - bdnz scrypt_core_loop1 + vadduwm v4, v1, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 - stvx v12, 0, r3 - slwi r6, r5, 7 - subf r4, r6, r4 - mtctr r5 - addi r5, r5, -1 - addi r7, r4, 1*16 - addi r8, r4, 2*16 - addi r9, r4, 3*16 -scrypt_core_loop2: - lwz r6, 0(r3) - and r6, r6, r5 - slwi r6, r6, 7 - lvx v0, r4, r6 - vxor v8, v8, v12 - lvx v1, r7, r6 - vxor v9, v9, v13 - lvx v2, r8, r6 - vxor v10, v10, v14 - lvx v3, r9, r6 - vxor v11, v11, v15 - vxor v0, v0, v8 - vxor v1, v1, v9 - vxor v2, v2, v10 - vxor v3, v3, v11 - addi r6, r6, 64 - vor v8, v0, v0 - vor v9, v1, v1 - lvx v5, r4, r6 - vor v10, v2, v2 - lvx v6, r7, r6 - vor v11, v3, v3 - lvx v7, r8, r6 + vadduwm v4, v2, v1 + vrlw v4, v4, v18 + vsldoi v1, v1, v1, 12 + vxor v3, v3, v4 + + vadduwm v4, v3, v2 + vrlw v4, v4, v19 + vsldoi v3, v3, v3, 4 + vxor v0, v0, v4 + vsldoi v2, v2, v2, 8 + vadduwm v4, v0, v1 + vrlw v4, v4, v16 + vxor v3, v3, v4 + + vadduwm v4, v3, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, v2, v3 + vrlw v4, v4, v18 + vsldoi v3, v3, v3, 12 + vxor v1, v1, v4 + + vadduwm v4, v1, v2 + vrlw v4, v4, v19 + vsldoi v1, v1, v1, 4 + vxor v0, v0, v4 + + vadduwm v4, v0, v3 + vrlw v4, v4, v16 + vsldoi v2, v2, v2, 8 + vxor v1, v1, v4 + + vadduwm v4, v1, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, v2, v1 + vrlw v4, v4, v18 + vsldoi v1, v1, v1, 12 + vxor v3, v3, v4 + + vadduwm v4, v3, v2 + vrlw v4, v4, v19 + vsldoi v3, v3, v3, 4 + vxor v0, v0, v4 + vsldoi v2, v2, v2, 8 + vadduwm v4, v0, v1 + vrlw v4, v4, v16 + vxor v3, v3, v4 + + vadduwm v4, v3, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, v2, v3 + vrlw v4, v4, v18 + vsldoi v3, v3, v3, 12 + vxor v1, v1, v4 + + vadduwm v4, v1, v2 + vrlw v4, v4, v19 + vsldoi v1, v1, v1, 4 + vxor v0, v0, v4 + + vadduwm v4, v0, v3 + vrlw v4, v4, v16 + vsldoi v2, v2, v2, 8 + vxor v1, v1, v4 + + vadduwm v4, v1, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, v2, v1 + vrlw v4, v4, v18 + vsldoi v1, v1, v1, 12 + vxor v3, v3, v4 + + vadduwm v4, v3, v2 + vrlw v4, v4, v19 + vsldoi v3, v3, v3, 4 + vxor v0, v0, v4 + vsldoi v2, v2, v2, 8 + vadduwm v4, v0, v1 + vrlw v4, v4, v16 + vxor v3, v3, v4 + + vadduwm v4, v3, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, v2, v3 + vrlw v4, v4, v18 + vsldoi v3, v3, v3, 12 + vxor v1, v1, v4 + + vadduwm v4, v1, v2 + vrlw v4, v4, v19 + vsldoi v1, v1, v1, 4 + vxor v0, v0, v4 + + vadduwm v4, v0, v3 + vrlw v4, v4, v16 + vsldoi v2, v2, v2, 8 + vxor v1, v1, v4 + + vadduwm v4, v1, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, v2, v1 + vrlw v4, v4, v18 + vsldoi v1, v1, v1, 12 + vxor v3, v3, v4 - salsa8_core + vadduwm v4, v3, v2 + vrlw v4, v4, v19 + vsldoi v3, v3, v3, 4 + vxor v0, v0, v4 + vsldoi v2, v2, v2, 8 vadduwm v8, v8, v0 - lvx v0, r9, 
r6 vadduwm v9, v9, v1 vadduwm v10, v10, v2 vadduwm v11, v11, v3 - vxor v12, v12, v5 - vxor v13, v13, v6 - vxor v14, v14, v7 - vxor v15, v15, v0 vxor v12, v12, v8 vxor v13, v13, v9 vxor v14, v14, v10 @@ -319,294 +357,2297 @@ scrypt_core_loop2: vor v2, v14, v14 vor v3, v15, v15 - salsa8_core - - vadduwm v12, v12, v0 - stvx v12, 0, r3 - vadduwm v13, v13, v1 - vadduwm v14, v14, v2 - vadduwm v15, v15, v3 + vadduwm v4, v0, v1 + vrlw v4, v4, v16 + vxor v3, v3, v4 - bdnz scrypt_core_loop2 + vadduwm v4, v3, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 - vxor v0, v0, v0 - vnor v1, v0, v0 - vsldoi v2, v0, v1, 4 - vsldoi v3, v2, v0, 8 - vor v3, v3, v2 - vsldoi v1, v0, v1, 8 + vadduwm v4, v2, v3 + vrlw v4, v4, v18 + vsldoi v3, v3, v3, 12 + vxor v1, v1, v4 - vor v4, v8, v8 - vsel v8, v8, v9, v3 - vsel v9, v9, v10, v3 - vsel v10, v10, v11, v3 - vsel v11, v11, v4, v3 - vor v4, v8, v8 - vor v5, v9, v9 - vsel v8, v8, v10, v1 - vsel v9, v11, v9, v1 - vsel v10, v10, v4, v1 - vsel v11, v5, v11, v1 + vadduwm v4, v1, v2 + vrlw v4, v4, v19 + vsldoi v1, v1, v1, 4 + vxor v0, v0, v4 - vor v4, v12, v12 - vsel v12, v12, v13, v3 - vsel v13, v13, v14, v3 - vsel v14, v14, v15, v3 - vsel v15, v15, v4, v3 - vor v4, v12, v12 - vor v5, v13, v13 - vsel v12, v12, v14, v1 - vsel v13, v15, v13, v1 - vsel v14, v14, v4, v1 - vsel v15, v5, v15, v1 + vadduwm v4, v0, v3 + vrlw v4, v4, v16 + vsldoi v2, v2, v2, 8 + vxor v1, v1, v4 - li r6, 1*16 - li r7, 2*16 - li r8, 3*16 - li r9, 4*16 + vadduwm v4, v1, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 - stvx v8, 0, r3 - stvx v9, r3, r6 - stvx v10, r3, r7 - stvx v11, r3, r8 - stvx v12, r3, r9 - stvx v13, r3, r10 - stvx v14, r3, r11 - stvx v15, r3, r12 + vadduwm v4, v2, v1 + vrlw v4, v4, v18 + vsldoi v1, v1, v1, 12 + vxor v3, v3, v4 - ld r0, 2*4(r1) - mtspr 256, r0 - addi r1, r1, 4*4 - blr - -#else /* __ALTIVEC__ */ - -.macro salsa8_core_doubleround - add r0, r16, r28 - add r5, r21, r17 - add r6, r26, r22 - add r7, r31, r27 - rotlwi r0, r0, 7 - rotlwi r5, r5, 7 - rotlwi r6, r6, 7 - rotlwi r7, r7, 7 - xor r20, r20, r0 - xor r25, r25, r5 - xor r30, r30, r6 - xor r19, r19, r7 + vadduwm v4, v3, v2 + vrlw v4, v4, v19 + vsldoi v3, v3, v3, 4 + vxor v0, v0, v4 + vsldoi v2, v2, v2, 8 + vadduwm v4, v0, v1 + vrlw v4, v4, v16 + vxor v3, v3, v4 - add r0, r20, r16 - add r5, r25, r21 - add r6, r30, r26 - add r7, r19, r31 - rotlwi r0, r0, 9 - rotlwi r5, r5, 9 - rotlwi r6, r6, 9 - rotlwi r7, r7, 9 - xor r24, r24, r0 - xor r29, r29, r5 - xor r18, r18, r6 - xor r23, r23, r7 + vadduwm v4, v3, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 - add r0, r24, r20 - add r5, r29, r25 - add r6, r18, r30 - add r7, r23, r19 - rotlwi r0, r0, 13 - rotlwi r5, r5, 13 - rotlwi r6, r6, 13 - rotlwi r7, r7, 13 - xor r28, r28, r0 - xor r17, r17, r5 - xor r22, r22, r6 - xor r27, r27, r7 + vadduwm v4, v2, v3 + vrlw v4, v4, v18 + vsldoi v3, v3, v3, 12 + vxor v1, v1, v4 - add r0, r28, r24 - add r5, r17, r29 - add r6, r22, r18 - add r7, r27, r23 - rotlwi r0, r0, 18 - rotlwi r5, r5, 18 - rotlwi r6, r6, 18 - rotlwi r7, r7, 18 - xor r16, r16, r0 - xor r21, r21, r5 - xor r26, r26, r6 - xor r31, r31, r7 + vadduwm v4, v1, v2 + vrlw v4, v4, v19 + vsldoi v1, v1, v1, 4 + vxor v0, v0, v4 - add r0, r16, r19 - add r5, r21, r20 - add r6, r26, r25 - add r7, r31, r30 - rotlwi r0, r0, 7 - rotlwi r5, r5, 7 - rotlwi r6, r6, 7 - rotlwi r7, r7, 7 - xor r17, r17, r0 - xor r22, r22, r5 - xor r27, r27, r6 - xor r28, r28, r7 + vadduwm v4, v0, v3 + vrlw v4, v4, v16 + vsldoi v2, v2, v2, 8 + vxor v1, v1, v4 - add r0, r17, r16 - add r5, r22, r21 - add r6, r27, r26 - add r7, r28, 
r31 - rotlwi r0, r0, 9 - rotlwi r5, r5, 9 - rotlwi r6, r6, 9 - rotlwi r7, r7, 9 - xor r18, r18, r0 - xor r23, r23, r5 - xor r24, r24, r6 - xor r29, r29, r7 + vadduwm v4, v1, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 - add r0, r18, r17 - add r5, r23, r22 - add r6, r24, r27 - add r7, r29, r28 - rotlwi r0, r0, 13 - rotlwi r5, r5, 13 - rotlwi r6, r6, 13 - rotlwi r7, r7, 13 - xor r19, r19, r0 - xor r20, r20, r5 - xor r25, r25, r6 - xor r30, r30, r7 + vadduwm v4, v2, v1 + vrlw v4, v4, v18 + vsldoi v1, v1, v1, 12 + vxor v3, v3, v4 - add r0, r19, r18 - add r5, r20, r23 - add r6, r25, r24 - add r7, r30, r29 - rotlwi r0, r0, 18 - rotlwi r5, r5, 18 - rotlwi r6, r6, 18 - rotlwi r7, r7, 18 - xor r16, r16, r0 - xor r21, r21, r5 - xor r26, r26, r6 - xor r31, r31, r7 -.endm - -.macro salsa8_core - salsa8_core_doubleround - salsa8_core_doubleround - salsa8_core_doubleround - salsa8_core_doubleround -.endm - -#ifdef _AIX - .csect .text[PR] -#else - .text -#endif - .align 2 - .globl scrypt_core - .globl _scrypt_core - .globl .scrypt_core -#ifdef __ELF__ - .type scrypt_core, %function -#endif -scrypt_core: -_scrypt_core: -.scrypt_core: - stdu r1, -68*4(r1) - stw r5, 2*4(r1) - std r13, 4*4(r1) - std r14, 6*4(r1) - std r15, 8*4(r1) - std r16, 10*4(r1) - std r17, 12*4(r1) - std r18, 14*4(r1) - std r19, 16*4(r1) - std r20, 18*4(r1) - std r21, 20*4(r1) - std r3, 22*4(r1) - std r22, 48*4(r1) - std r23, 50*4(r1) - std r24, 52*4(r1) - std r25, 54*4(r1) - std r26, 56*4(r1) - std r27, 58*4(r1) - std r28, 60*4(r1) - std r29, 62*4(r1) - std r30, 64*4(r1) - std r31, 66*4(r1) + vadduwm v4, v3, v2 + vrlw v4, v4, v19 + vsldoi v3, v3, v3, 4 + vxor v0, v0, v4 + vsldoi v2, v2, v2, 8 + vadduwm v4, v0, v1 + vrlw v4, v4, v16 + vxor v3, v3, v4 - lwz r16, 0*4(r3) - lwz r17, 1*4(r3) - lwz r18, 2*4(r3) - lwz r19, 3*4(r3) - lwz r20, 4*4(r3) - lwz r21, 5*4(r3) - lwz r22, 6*4(r3) - lwz r23, 7*4(r3) - stw r16, 24*4(r1) - stw r17, 25*4(r1) - stw r18, 26*4(r1) - stw r19, 27*4(r1) - stw r20, 28*4(r1) - stw r21, 29*4(r1) - stw r22, 30*4(r1) - stw r23, 31*4(r1) - lwz r24, 8*4(r3) - lwz r25, 9*4(r3) - lwz r26, 10*4(r3) - lwz r27, 11*4(r3) - lwz r28, 12*4(r3) - lwz r29, 13*4(r3) - lwz r30, 14*4(r3) - lwz r31, 15*4(r3) - stw r24, 32*4(r1) - stw r25, 33*4(r1) - stw r26, 34*4(r1) + vadduwm v4, v3, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, v2, v3 + vrlw v4, v4, v18 + vsldoi v3, v3, v3, 12 + vxor v1, v1, v4 + + vadduwm v4, v1, v2 + vrlw v4, v4, v19 + vsldoi v1, v1, v1, 4 + vxor v0, v0, v4 + + vadduwm v4, v0, v3 + vrlw v4, v4, v16 + vsldoi v2, v2, v2, 8 + vxor v1, v1, v4 + + vadduwm v4, v1, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, v2, v1 + vrlw v4, v4, v18 + vsldoi v1, v1, v1, 12 + vxor v3, v3, v4 + + vadduwm v4, v3, v2 + vrlw v4, v4, v19 + vsldoi v3, v3, v3, 4 + vxor v0, v0, v4 + vsldoi v2, v2, v2, 8 + vadduwm v4, v0, v1 + vrlw v4, v4, v16 + vxor v3, v3, v4 + + vadduwm v4, v3, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, v2, v3 + vrlw v4, v4, v18 + vsldoi v3, v3, v3, 12 + vxor v1, v1, v4 + + vadduwm v4, v1, v2 + vrlw v4, v4, v19 + vsldoi v1, v1, v1, 4 + vxor v0, v0, v4 + + vadduwm v4, v0, v3 + vrlw v4, v4, v16 + vsldoi v2, v2, v2, 8 + vxor v1, v1, v4 + + vadduwm v4, v1, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, v2, v1 + vrlw v4, v4, v18 + vsldoi v1, v1, v1, 12 + vxor v3, v3, v4 + + vadduwm v4, v3, v2 + vrlw v4, v4, v19 + vsldoi v3, v3, v3, 4 + vxor v0, v0, v4 + vsldoi v2, v2, v2, 8 + + vadduwm v12, v12, v0 + vadduwm v13, v13, v1 + vadduwm v14, v14, v2 + vadduwm v15, v15, v3 + + addi r4, r4, 32*4 + 
bdnz scrypt_core_loop1 + + stvx v12, 0, r3 + slwi r6, r5, 7 + subf r4, r6, r4 + mtctr r5 + addi r5, r5, -1 + addi r7, r4, 1*16 + addi r8, r4, 2*16 + addi r9, r4, 3*16 +scrypt_core_loop2: + lwz r6, 0(r3) + and r6, r6, r5 + slwi r6, r6, 7 + lvx v0, r4, r6 + vxor v8, v8, v12 + lvx v1, r7, r6 + vxor v9, v9, v13 + lvx v2, r8, r6 + vxor v10, v10, v14 + lvx v3, r9, r6 + vxor v11, v11, v15 + vxor v0, v0, v8 + vxor v1, v1, v9 + vxor v2, v2, v10 + vxor v3, v3, v11 + addi r6, r6, 64 + vor v8, v0, v0 + vor v9, v1, v1 + lvx v5, r4, r6 + vor v10, v2, v2 + lvx v6, r7, r6 + vor v11, v3, v3 + lvx v7, r8, r6 + + vadduwm v4, v0, v1 + vrlw v4, v4, v16 + vxor v3, v3, v4 + + vadduwm v4, v3, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, v2, v3 + vrlw v4, v4, v18 + vsldoi v3, v3, v3, 12 + vxor v1, v1, v4 + + vadduwm v4, v1, v2 + vrlw v4, v4, v19 + vsldoi v1, v1, v1, 4 + vxor v0, v0, v4 + + vadduwm v4, v0, v3 + vrlw v4, v4, v16 + vsldoi v2, v2, v2, 8 + vxor v1, v1, v4 + + vadduwm v4, v1, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, v2, v1 + vrlw v4, v4, v18 + vsldoi v1, v1, v1, 12 + vxor v3, v3, v4 + + vadduwm v4, v3, v2 + vrlw v4, v4, v19 + vsldoi v3, v3, v3, 4 + vxor v0, v0, v4 + vsldoi v2, v2, v2, 8 + vadduwm v4, v0, v1 + vrlw v4, v4, v16 + vxor v3, v3, v4 + + vadduwm v4, v3, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, v2, v3 + vrlw v4, v4, v18 + vsldoi v3, v3, v3, 12 + vxor v1, v1, v4 + + vadduwm v4, v1, v2 + vrlw v4, v4, v19 + vsldoi v1, v1, v1, 4 + vxor v0, v0, v4 + + vadduwm v4, v0, v3 + vrlw v4, v4, v16 + vsldoi v2, v2, v2, 8 + vxor v1, v1, v4 + + vadduwm v4, v1, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, v2, v1 + vrlw v4, v4, v18 + vsldoi v1, v1, v1, 12 + vxor v3, v3, v4 + + vadduwm v4, v3, v2 + vrlw v4, v4, v19 + vsldoi v3, v3, v3, 4 + vxor v0, v0, v4 + vsldoi v2, v2, v2, 8 + vadduwm v4, v0, v1 + vrlw v4, v4, v16 + vxor v3, v3, v4 + + vadduwm v4, v3, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, v2, v3 + vrlw v4, v4, v18 + vsldoi v3, v3, v3, 12 + vxor v1, v1, v4 + + vadduwm v4, v1, v2 + vrlw v4, v4, v19 + vsldoi v1, v1, v1, 4 + vxor v0, v0, v4 + + vadduwm v4, v0, v3 + vrlw v4, v4, v16 + vsldoi v2, v2, v2, 8 + vxor v1, v1, v4 + + vadduwm v4, v1, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, v2, v1 + vrlw v4, v4, v18 + vsldoi v1, v1, v1, 12 + vxor v3, v3, v4 + + vadduwm v4, v3, v2 + vrlw v4, v4, v19 + vsldoi v3, v3, v3, 4 + vxor v0, v0, v4 + vsldoi v2, v2, v2, 8 + vadduwm v4, v0, v1 + vrlw v4, v4, v16 + vxor v3, v3, v4 + + vadduwm v4, v3, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, v2, v3 + vrlw v4, v4, v18 + vsldoi v3, v3, v3, 12 + vxor v1, v1, v4 + + vadduwm v4, v1, v2 + vrlw v4, v4, v19 + vsldoi v1, v1, v1, 4 + vxor v0, v0, v4 + + vadduwm v4, v0, v3 + vrlw v4, v4, v16 + vsldoi v2, v2, v2, 8 + vxor v1, v1, v4 + + vadduwm v4, v1, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, v2, v1 + vrlw v4, v4, v18 + vsldoi v1, v1, v1, 12 + vxor v3, v3, v4 + + vadduwm v4, v3, v2 + vrlw v4, v4, v19 + vsldoi v3, v3, v3, 4 + vxor v0, v0, v4 + vsldoi v2, v2, v2, 8 + + vadduwm v8, v8, v0 + lvx v0, r9, r6 + vadduwm v9, v9, v1 + vadduwm v10, v10, v2 + vadduwm v11, v11, v3 + + vxor v12, v12, v5 + vxor v13, v13, v6 + vxor v14, v14, v7 + vxor v15, v15, v0 + vxor v12, v12, v8 + vxor v13, v13, v9 + vxor v14, v14, v10 + vxor v15, v15, v11 + vor v0, v12, v12 + vor v1, v13, v13 + vor v2, v14, v14 + vor v3, v15, v15 + + vadduwm v4, v0, v1 + vrlw v4, v4, v16 + vxor v3, v3, v4 + + vadduwm v4, v3, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, 
v2, v3 + vrlw v4, v4, v18 + vsldoi v3, v3, v3, 12 + vxor v1, v1, v4 + + vadduwm v4, v1, v2 + vrlw v4, v4, v19 + vsldoi v1, v1, v1, 4 + vxor v0, v0, v4 + + vadduwm v4, v0, v3 + vrlw v4, v4, v16 + vsldoi v2, v2, v2, 8 + vxor v1, v1, v4 + + vadduwm v4, v1, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, v2, v1 + vrlw v4, v4, v18 + vsldoi v1, v1, v1, 12 + vxor v3, v3, v4 + + vadduwm v4, v3, v2 + vrlw v4, v4, v19 + vsldoi v3, v3, v3, 4 + vxor v0, v0, v4 + vsldoi v2, v2, v2, 8 + vadduwm v4, v0, v1 + vrlw v4, v4, v16 + vxor v3, v3, v4 + + vadduwm v4, v3, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, v2, v3 + vrlw v4, v4, v18 + vsldoi v3, v3, v3, 12 + vxor v1, v1, v4 + + vadduwm v4, v1, v2 + vrlw v4, v4, v19 + vsldoi v1, v1, v1, 4 + vxor v0, v0, v4 + + vadduwm v4, v0, v3 + vrlw v4, v4, v16 + vsldoi v2, v2, v2, 8 + vxor v1, v1, v4 + + vadduwm v4, v1, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, v2, v1 + vrlw v4, v4, v18 + vsldoi v1, v1, v1, 12 + vxor v3, v3, v4 + + vadduwm v4, v3, v2 + vrlw v4, v4, v19 + vsldoi v3, v3, v3, 4 + vxor v0, v0, v4 + vsldoi v2, v2, v2, 8 + vadduwm v4, v0, v1 + vrlw v4, v4, v16 + vxor v3, v3, v4 + + vadduwm v4, v3, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, v2, v3 + vrlw v4, v4, v18 + vsldoi v3, v3, v3, 12 + vxor v1, v1, v4 + + vadduwm v4, v1, v2 + vrlw v4, v4, v19 + vsldoi v1, v1, v1, 4 + vxor v0, v0, v4 + + vadduwm v4, v0, v3 + vrlw v4, v4, v16 + vsldoi v2, v2, v2, 8 + vxor v1, v1, v4 + + vadduwm v4, v1, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, v2, v1 + vrlw v4, v4, v18 + vsldoi v1, v1, v1, 12 + vxor v3, v3, v4 + + vadduwm v4, v3, v2 + vrlw v4, v4, v19 + vsldoi v3, v3, v3, 4 + vxor v0, v0, v4 + vsldoi v2, v2, v2, 8 + vadduwm v4, v0, v1 + vrlw v4, v4, v16 + vxor v3, v3, v4 + + vadduwm v4, v3, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, v2, v3 + vrlw v4, v4, v18 + vsldoi v3, v3, v3, 12 + vxor v1, v1, v4 + + vadduwm v4, v1, v2 + vrlw v4, v4, v19 + vsldoi v1, v1, v1, 4 + vxor v0, v0, v4 + + vadduwm v4, v0, v3 + vrlw v4, v4, v16 + vsldoi v2, v2, v2, 8 + vxor v1, v1, v4 + + vadduwm v4, v1, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, v2, v1 + vrlw v4, v4, v18 + vsldoi v1, v1, v1, 12 + vxor v3, v3, v4 + + vadduwm v4, v3, v2 + vrlw v4, v4, v19 + vsldoi v3, v3, v3, 4 + vxor v0, v0, v4 + vsldoi v2, v2, v2, 8 + + vadduwm v12, v12, v0 + stvx v12, 0, r3 + vadduwm v13, v13, v1 + vadduwm v14, v14, v2 + vadduwm v15, v15, v3 + + bdnz scrypt_core_loop2 + + vxor v0, v0, v0 + vnor v1, v0, v0 + vsldoi v2, v0, v1, 4 + vsldoi v3, v2, v0, 8 + vor v3, v3, v2 + vsldoi v1, v0, v1, 8 + + vor v4, v8, v8 + vsel v8, v8, v9, v3 + vsel v9, v9, v10, v3 + vsel v10, v10, v11, v3 + vsel v11, v11, v4, v3 + vor v4, v8, v8 + vor v5, v9, v9 + vsel v8, v8, v10, v1 + vsel v9, v11, v9, v1 + vsel v10, v10, v4, v1 + vsel v11, v5, v11, v1 + + vor v4, v12, v12 + vsel v12, v12, v13, v3 + vsel v13, v13, v14, v3 + vsel v14, v14, v15, v3 + vsel v15, v15, v4, v3 + vor v4, v12, v12 + vor v5, v13, v13 + vsel v12, v12, v14, v1 + vsel v13, v15, v13, v1 + vsel v14, v14, v4, v1 + vsel v15, v5, v15, v1 + + li r6, 1*16 + li r7, 2*16 + li r8, 3*16 + li r9, 4*16 + + stvx v8, 0, r3 + stvx v9, r3, r6 + stvx v10, r3, r7 + stvx v11, r3, r8 + stvx v12, r3, r9 + stvx v13, r3, r10 + stvx v14, r3, r11 + stvx v15, r3, r12 + + ld r0, 2*4(r1) + mtspr 256, r0 + addi r1, r1, 4*4 + blr + +#else /* __ALTIVEC__ */ + + + +#ifdef _AIX + .csect .text[PR] +#else + .text +#endif + .align 2 + .globl scrypt_core + .globl _scrypt_core + .globl .scrypt_core +#ifdef __ELF__ + 
.type scrypt_core, %function +#endif +scrypt_core: +_scrypt_core: +.scrypt_core: + stdu r1, -68*4(r1) + stw r5, 2*4(r1) + std r13, 4*4(r1) + std r14, 6*4(r1) + std r15, 8*4(r1) + std r16, 10*4(r1) + std r17, 12*4(r1) + std r18, 14*4(r1) + std r19, 16*4(r1) + std r20, 18*4(r1) + std r21, 20*4(r1) + std r3, 22*4(r1) + std r22, 48*4(r1) + std r23, 50*4(r1) + std r24, 52*4(r1) + std r25, 54*4(r1) + std r26, 56*4(r1) + std r27, 58*4(r1) + std r28, 60*4(r1) + std r29, 62*4(r1) + std r30, 64*4(r1) + std r31, 66*4(r1) + + lwz r16, 0*4(r3) + lwz r17, 1*4(r3) + lwz r18, 2*4(r3) + lwz r19, 3*4(r3) + lwz r20, 4*4(r3) + lwz r21, 5*4(r3) + lwz r22, 6*4(r3) + lwz r23, 7*4(r3) + stw r16, 24*4(r1) + stw r17, 25*4(r1) + stw r18, 26*4(r1) + stw r19, 27*4(r1) + stw r20, 28*4(r1) + stw r21, 29*4(r1) + stw r22, 30*4(r1) + stw r23, 31*4(r1) + lwz r24, 8*4(r3) + lwz r25, 9*4(r3) + lwz r26, 10*4(r3) + lwz r27, 11*4(r3) + lwz r28, 12*4(r3) + lwz r29, 13*4(r3) + lwz r30, 14*4(r3) + lwz r31, 15*4(r3) + stw r24, 32*4(r1) + stw r25, 33*4(r1) + stw r26, 34*4(r1) + stw r27, 35*4(r1) + stw r28, 36*4(r1) + stw r29, 37*4(r1) + stw r30, 38*4(r1) + stw r31, 39*4(r1) + lwz r16, 16*4(r3) + lwz r17, 17*4(r3) + lwz r18, 18*4(r3) + lwz r19, 19*4(r3) + lwz r20, 20*4(r3) + lwz r21, 21*4(r3) + lwz r22, 22*4(r3) + lwz r23, 23*4(r3) + stw r16, 40*4(r1) + stw r17, 41*4(r1) + stw r18, 42*4(r1) + stw r19, 43*4(r1) + stw r20, 44*4(r1) + stw r21, 45*4(r1) + stw r22, 46*4(r1) + stw r23, 47*4(r1) + lwz r8, 24*4(r3) + lwz r9, 25*4(r3) + lwz r10, 26*4(r3) + lwz r11, 27*4(r3) + lwz r12, 28*4(r3) + lwz r13, 29*4(r3) + lwz r14, 30*4(r3) + lwz r15, 31*4(r3) + + mtctr r5 +scrypt_core_loop1: + lwz r16, 24*4(r1) + lwz r17, 25*4(r1) + lwz r18, 26*4(r1) + lwz r19, 27*4(r1) + lwz r20, 28*4(r1) + lwz r21, 29*4(r1) + lwz r22, 30*4(r1) + lwz r23, 31*4(r1) + lwz r24, 32*4(r1) + lwz r25, 33*4(r1) + lwz r26, 34*4(r1) + lwz r27, 35*4(r1) + lwz r28, 36*4(r1) + lwz r29, 37*4(r1) + lwz r30, 38*4(r1) + lwz r31, 39*4(r1) + + lwz r0, 40*4(r1) + lwz r5, 41*4(r1) + lwz r6, 42*4(r1) + lwz r7, 43*4(r1) + xor r16, r16, r0 + xor r17, r17, r5 + xor r18, r18, r6 + xor r19, r19, r7 + stw r16, 0*4(r4) + stw r17, 1*4(r4) + stw r18, 2*4(r4) + stw r19, 3*4(r4) + stw r0, 16*4(r4) + stw r5, 17*4(r4) + stw r6, 18*4(r4) + stw r7, 19*4(r4) + + lwz r0, 44*4(r1) + lwz r5, 45*4(r1) + lwz r6, 46*4(r1) + lwz r7, 47*4(r1) + xor r20, r20, r0 + xor r21, r21, r5 + xor r22, r22, r6 + xor r23, r23, r7 + stw r0, 20*4(r4) + stw r5, 21*4(r4) + stw r6, 22*4(r4) + stw r7, 23*4(r4) + stw r20, 4*4(r4) + stw r21, 5*4(r4) + stw r22, 6*4(r4) + stw r23, 7*4(r4) + + xor r24, r24, r8 + xor r25, r25, r9 + xor r26, r26, r10 + xor r27, r27, r11 + xor r28, r28, r12 + xor r29, r29, r13 + xor r30, r30, r14 + xor r31, r31, r15 + stw r24, 8*4(r4) + stw r25, 9*4(r4) + stw r26, 10*4(r4) + stw r27, 11*4(r4) + stw r28, 12*4(r4) + stw r29, 13*4(r4) + stw r30, 14*4(r4) + stw r31, 15*4(r4) + stw r8, 24*4(r4) + stw r9, 25*4(r4) + stw r10, 26*4(r4) + stw r11, 27*4(r4) + stw r12, 28*4(r4) + stw r13, 29*4(r4) + stw r14, 30*4(r4) + stw r15, 31*4(r4) + + add r0, r16, r28 + add r5, r21, r17 + add r6, r26, r22 + add r7, r31, r27 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r20, r20, r0 + xor r25, r25, r5 + xor r30, r30, r6 + xor r19, r19, r7 + + add r0, r20, r16 + add r5, r25, r21 + add r6, r30, r26 + add r7, r19, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r24, r24, r0 + xor r29, r29, r5 + xor r18, r18, r6 + xor r23, r23, r7 + + add r0, r24, r20 + add 
r5, r29, r25 + add r6, r18, r30 + add r7, r23, r19 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r28, r28, r0 + xor r17, r17, r5 + xor r22, r22, r6 + xor r27, r27, r7 + + add r0, r28, r24 + add r5, r17, r29 + add r6, r22, r18 + add r7, r27, r23 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + + add r0, r16, r19 + add r5, r21, r20 + add r6, r26, r25 + add r7, r31, r30 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r17, r17, r0 + xor r22, r22, r5 + xor r27, r27, r6 + xor r28, r28, r7 + + add r0, r17, r16 + add r5, r22, r21 + add r6, r27, r26 + add r7, r28, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r18, r18, r0 + xor r23, r23, r5 + xor r24, r24, r6 + xor r29, r29, r7 + + add r0, r18, r17 + add r5, r23, r22 + add r6, r24, r27 + add r7, r29, r28 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r19, r19, r0 + xor r20, r20, r5 + xor r25, r25, r6 + xor r30, r30, r7 + + add r0, r19, r18 + add r5, r20, r23 + add r6, r25, r24 + add r7, r30, r29 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + add r0, r16, r28 + add r5, r21, r17 + add r6, r26, r22 + add r7, r31, r27 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r20, r20, r0 + xor r25, r25, r5 + xor r30, r30, r6 + xor r19, r19, r7 + + add r0, r20, r16 + add r5, r25, r21 + add r6, r30, r26 + add r7, r19, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r24, r24, r0 + xor r29, r29, r5 + xor r18, r18, r6 + xor r23, r23, r7 + + add r0, r24, r20 + add r5, r29, r25 + add r6, r18, r30 + add r7, r23, r19 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r28, r28, r0 + xor r17, r17, r5 + xor r22, r22, r6 + xor r27, r27, r7 + + add r0, r28, r24 + add r5, r17, r29 + add r6, r22, r18 + add r7, r27, r23 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + + add r0, r16, r19 + add r5, r21, r20 + add r6, r26, r25 + add r7, r31, r30 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r17, r17, r0 + xor r22, r22, r5 + xor r27, r27, r6 + xor r28, r28, r7 + + add r0, r17, r16 + add r5, r22, r21 + add r6, r27, r26 + add r7, r28, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r18, r18, r0 + xor r23, r23, r5 + xor r24, r24, r6 + xor r29, r29, r7 + + add r0, r18, r17 + add r5, r23, r22 + add r6, r24, r27 + add r7, r29, r28 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r19, r19, r0 + xor r20, r20, r5 + xor r25, r25, r6 + xor r30, r30, r7 + + add r0, r19, r18 + add r5, r20, r23 + add r6, r25, r24 + add r7, r30, r29 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + add r0, r16, r28 + add r5, r21, r17 + add r6, r26, r22 + add r7, r31, r27 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r20, r20, r0 + xor r25, r25, r5 + xor r30, r30, r6 + xor r19, r19, r7 + + add r0, r20, r16 + add r5, r25, r21 + add r6, r30, r26 + add r7, r19, r31 + rotlwi r0, r0, 9 + 
rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r24, r24, r0 + xor r29, r29, r5 + xor r18, r18, r6 + xor r23, r23, r7 + + add r0, r24, r20 + add r5, r29, r25 + add r6, r18, r30 + add r7, r23, r19 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r28, r28, r0 + xor r17, r17, r5 + xor r22, r22, r6 + xor r27, r27, r7 + + add r0, r28, r24 + add r5, r17, r29 + add r6, r22, r18 + add r7, r27, r23 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + + add r0, r16, r19 + add r5, r21, r20 + add r6, r26, r25 + add r7, r31, r30 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r17, r17, r0 + xor r22, r22, r5 + xor r27, r27, r6 + xor r28, r28, r7 + + add r0, r17, r16 + add r5, r22, r21 + add r6, r27, r26 + add r7, r28, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r18, r18, r0 + xor r23, r23, r5 + xor r24, r24, r6 + xor r29, r29, r7 + + add r0, r18, r17 + add r5, r23, r22 + add r6, r24, r27 + add r7, r29, r28 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r19, r19, r0 + xor r20, r20, r5 + xor r25, r25, r6 + xor r30, r30, r7 + + add r0, r19, r18 + add r5, r20, r23 + add r6, r25, r24 + add r7, r30, r29 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + add r0, r16, r28 + add r5, r21, r17 + add r6, r26, r22 + add r7, r31, r27 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r20, r20, r0 + xor r25, r25, r5 + xor r30, r30, r6 + xor r19, r19, r7 + + add r0, r20, r16 + add r5, r25, r21 + add r6, r30, r26 + add r7, r19, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r24, r24, r0 + xor r29, r29, r5 + xor r18, r18, r6 + xor r23, r23, r7 + + add r0, r24, r20 + add r5, r29, r25 + add r6, r18, r30 + add r7, r23, r19 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r28, r28, r0 + xor r17, r17, r5 + xor r22, r22, r6 + xor r27, r27, r7 + + add r0, r28, r24 + add r5, r17, r29 + add r6, r22, r18 + add r7, r27, r23 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + + add r0, r16, r19 + add r5, r21, r20 + add r6, r26, r25 + add r7, r31, r30 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r17, r17, r0 + xor r22, r22, r5 + xor r27, r27, r6 + xor r28, r28, r7 + + add r0, r17, r16 + add r5, r22, r21 + add r6, r27, r26 + add r7, r28, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r18, r18, r0 + xor r23, r23, r5 + xor r24, r24, r6 + xor r29, r29, r7 + + add r0, r18, r17 + add r5, r23, r22 + add r6, r24, r27 + add r7, r29, r28 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r19, r19, r0 + xor r20, r20, r5 + xor r25, r25, r6 + xor r30, r30, r7 + + add r0, r19, r18 + add r5, r20, r23 + add r6, r25, r24 + add r7, r30, r29 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + + lwz r0, 0*4(r4) + lwz r5, 1*4(r4) + lwz r6, 2*4(r4) + lwz r7, 3*4(r4) + add r16, r16, r0 + add r17, r17, r5 + add r18, r18, r6 + add r19, r19, r7 + lwz r0, 4*4(r4) + lwz 
r5, 5*4(r4) + lwz r6, 6*4(r4) + lwz r7, 7*4(r4) + add r20, r20, r0 + add r21, r21, r5 + add r22, r22, r6 + add r23, r23, r7 + lwz r0, 8*4(r4) + lwz r5, 9*4(r4) + lwz r6, 10*4(r4) + lwz r7, 11*4(r4) + add r24, r24, r0 + add r25, r25, r5 + add r26, r26, r6 + add r27, r27, r7 + lwz r0, 12*4(r4) + lwz r5, 13*4(r4) + lwz r6, 14*4(r4) + lwz r7, 15*4(r4) + add r28, r28, r0 + add r29, r29, r5 + add r30, r30, r6 + add r31, r31, r7 + + stw r16, 24*4(r1) + stw r17, 25*4(r1) + stw r18, 26*4(r1) + stw r19, 27*4(r1) + stw r20, 28*4(r1) + stw r21, 29*4(r1) + stw r22, 30*4(r1) + stw r23, 31*4(r1) + stw r24, 32*4(r1) + stw r25, 33*4(r1) + stw r26, 34*4(r1) + stw r27, 35*4(r1) + stw r28, 36*4(r1) + stw r29, 37*4(r1) + stw r30, 38*4(r1) + stw r31, 39*4(r1) + + lwz r0, 40*4(r1) + lwz r5, 41*4(r1) + lwz r6, 42*4(r1) + lwz r7, 43*4(r1) + xor r16, r16, r0 + xor r17, r17, r5 + xor r18, r18, r6 + xor r19, r19, r7 + lwz r0, 44*4(r1) + lwz r5, 45*4(r1) + lwz r6, 46*4(r1) + lwz r7, 47*4(r1) + xor r20, r20, r0 + xor r21, r21, r5 + xor r22, r22, r6 + xor r23, r23, r7 + xor r24, r24, r8 + xor r25, r25, r9 + xor r26, r26, r10 + xor r27, r27, r11 + xor r28, r28, r12 + xor r29, r29, r13 + xor r30, r30, r14 + xor r31, r31, r15 + stw r16, 40*4(r1) + stw r17, 41*4(r1) + stw r18, 42*4(r1) + stw r19, 43*4(r1) + mr r8, r24 + mr r9, r25 + mr r10, r26 + mr r11, r27 + stw r20, 44*4(r1) + stw r21, 45*4(r1) + stw r22, 46*4(r1) + stw r23, 47*4(r1) + mr r12, r28 + mr r13, r29 + mr r14, r30 + mr r15, r31 + + add r0, r16, r28 + add r5, r21, r17 + add r6, r26, r22 + add r7, r31, r27 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r20, r20, r0 + xor r25, r25, r5 + xor r30, r30, r6 + xor r19, r19, r7 + + add r0, r20, r16 + add r5, r25, r21 + add r6, r30, r26 + add r7, r19, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r24, r24, r0 + xor r29, r29, r5 + xor r18, r18, r6 + xor r23, r23, r7 + + add r0, r24, r20 + add r5, r29, r25 + add r6, r18, r30 + add r7, r23, r19 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r28, r28, r0 + xor r17, r17, r5 + xor r22, r22, r6 + xor r27, r27, r7 + + add r0, r28, r24 + add r5, r17, r29 + add r6, r22, r18 + add r7, r27, r23 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + + add r0, r16, r19 + add r5, r21, r20 + add r6, r26, r25 + add r7, r31, r30 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r17, r17, r0 + xor r22, r22, r5 + xor r27, r27, r6 + xor r28, r28, r7 + + add r0, r17, r16 + add r5, r22, r21 + add r6, r27, r26 + add r7, r28, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r18, r18, r0 + xor r23, r23, r5 + xor r24, r24, r6 + xor r29, r29, r7 + + add r0, r18, r17 + add r5, r23, r22 + add r6, r24, r27 + add r7, r29, r28 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r19, r19, r0 + xor r20, r20, r5 + xor r25, r25, r6 + xor r30, r30, r7 + + add r0, r19, r18 + add r5, r20, r23 + add r6, r25, r24 + add r7, r30, r29 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + add r0, r16, r28 + add r5, r21, r17 + add r6, r26, r22 + add r7, r31, r27 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r20, r20, r0 + xor r25, r25, r5 + xor r30, r30, r6 + xor 
r19, r19, r7 + + add r0, r20, r16 + add r5, r25, r21 + add r6, r30, r26 + add r7, r19, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r24, r24, r0 + xor r29, r29, r5 + xor r18, r18, r6 + xor r23, r23, r7 + + add r0, r24, r20 + add r5, r29, r25 + add r6, r18, r30 + add r7, r23, r19 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r28, r28, r0 + xor r17, r17, r5 + xor r22, r22, r6 + xor r27, r27, r7 + + add r0, r28, r24 + add r5, r17, r29 + add r6, r22, r18 + add r7, r27, r23 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + + add r0, r16, r19 + add r5, r21, r20 + add r6, r26, r25 + add r7, r31, r30 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r17, r17, r0 + xor r22, r22, r5 + xor r27, r27, r6 + xor r28, r28, r7 + + add r0, r17, r16 + add r5, r22, r21 + add r6, r27, r26 + add r7, r28, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r18, r18, r0 + xor r23, r23, r5 + xor r24, r24, r6 + xor r29, r29, r7 + + add r0, r18, r17 + add r5, r23, r22 + add r6, r24, r27 + add r7, r29, r28 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r19, r19, r0 + xor r20, r20, r5 + xor r25, r25, r6 + xor r30, r30, r7 + + add r0, r19, r18 + add r5, r20, r23 + add r6, r25, r24 + add r7, r30, r29 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + add r0, r16, r28 + add r5, r21, r17 + add r6, r26, r22 + add r7, r31, r27 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r20, r20, r0 + xor r25, r25, r5 + xor r30, r30, r6 + xor r19, r19, r7 + + add r0, r20, r16 + add r5, r25, r21 + add r6, r30, r26 + add r7, r19, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r24, r24, r0 + xor r29, r29, r5 + xor r18, r18, r6 + xor r23, r23, r7 + + add r0, r24, r20 + add r5, r29, r25 + add r6, r18, r30 + add r7, r23, r19 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r28, r28, r0 + xor r17, r17, r5 + xor r22, r22, r6 + xor r27, r27, r7 + + add r0, r28, r24 + add r5, r17, r29 + add r6, r22, r18 + add r7, r27, r23 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + + add r0, r16, r19 + add r5, r21, r20 + add r6, r26, r25 + add r7, r31, r30 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r17, r17, r0 + xor r22, r22, r5 + xor r27, r27, r6 + xor r28, r28, r7 + + add r0, r17, r16 + add r5, r22, r21 + add r6, r27, r26 + add r7, r28, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r18, r18, r0 + xor r23, r23, r5 + xor r24, r24, r6 + xor r29, r29, r7 + + add r0, r18, r17 + add r5, r23, r22 + add r6, r24, r27 + add r7, r29, r28 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r19, r19, r0 + xor r20, r20, r5 + xor r25, r25, r6 + xor r30, r30, r7 + + add r0, r19, r18 + add r5, r20, r23 + add r6, r25, r24 + add r7, r30, r29 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + add r0, r16, r28 + add r5, r21, r17 + add r6, r26, r22 + 
add r7, r31, r27 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r20, r20, r0 + xor r25, r25, r5 + xor r30, r30, r6 + xor r19, r19, r7 + + add r0, r20, r16 + add r5, r25, r21 + add r6, r30, r26 + add r7, r19, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r24, r24, r0 + xor r29, r29, r5 + xor r18, r18, r6 + xor r23, r23, r7 + + add r0, r24, r20 + add r5, r29, r25 + add r6, r18, r30 + add r7, r23, r19 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r28, r28, r0 + xor r17, r17, r5 + xor r22, r22, r6 + xor r27, r27, r7 + + add r0, r28, r24 + add r5, r17, r29 + add r6, r22, r18 + add r7, r27, r23 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + + add r0, r16, r19 + add r5, r21, r20 + add r6, r26, r25 + add r7, r31, r30 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r17, r17, r0 + xor r22, r22, r5 + xor r27, r27, r6 + xor r28, r28, r7 + + add r0, r17, r16 + add r5, r22, r21 + add r6, r27, r26 + add r7, r28, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r18, r18, r0 + xor r23, r23, r5 + xor r24, r24, r6 + xor r29, r29, r7 + + add r0, r18, r17 + add r5, r23, r22 + add r6, r24, r27 + add r7, r29, r28 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r19, r19, r0 + xor r20, r20, r5 + xor r25, r25, r6 + xor r30, r30, r7 + + add r0, r19, r18 + add r5, r20, r23 + add r6, r25, r24 + add r7, r30, r29 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + + lwz r0, 40*4(r1) + lwz r5, 41*4(r1) + lwz r6, 42*4(r1) + lwz r7, 43*4(r1) + add r16, r16, r0 + add r17, r17, r5 + add r18, r18, r6 + add r19, r19, r7 + lwz r0, 44*4(r1) + lwz r5, 45*4(r1) + lwz r6, 46*4(r1) + lwz r7, 47*4(r1) + add r20, r20, r0 + add r21, r21, r5 + add r22, r22, r6 + add r23, r23, r7 + add r8, r8, r24 + add r9, r9, r25 + add r10, r10, r26 + add r11, r11, r27 + stw r16, 40*4(r1) + stw r17, 41*4(r1) + stw r18, 42*4(r1) + stw r19, 43*4(r1) + add r12, r12, r28 + add r13, r13, r29 + add r14, r14, r30 + add r15, r15, r31 + stw r20, 44*4(r1) + stw r21, 45*4(r1) + stw r22, 46*4(r1) + stw r23, 47*4(r1) + + addi r4, r4, 32*4 + bdnz scrypt_core_loop1 + + lwz r5, 2*4(r1) + slwi r3, r5, 7 + subf r4, r3, r4 + mtctr r5 + addi r5, r5, -1 + stw r5, 2*4(r1) +scrypt_core_loop2: + and r3, r16, r5 + slwi r3, r3, 7 + add r3, r3, r4 + mr r0, r16 + mr r5, r17 + mr r6, r18 + mr r7, r19 + lwz r16, 24*4(r1) + lwz r17, 25*4(r1) + lwz r18, 26*4(r1) + lwz r19, 27*4(r1) + lwz r20, 28*4(r1) + lwz r21, 29*4(r1) + lwz r22, 30*4(r1) + lwz r23, 31*4(r1) + lwz r24, 32*4(r1) + lwz r25, 33*4(r1) + lwz r26, 34*4(r1) + lwz r27, 35*4(r1) + lwz r28, 36*4(r1) + lwz r29, 37*4(r1) + lwz r30, 38*4(r1) + lwz r31, 39*4(r1) + xor r16, r16, r0 + xor r17, r17, r5 + xor r18, r18, r6 + xor r19, r19, r7 + lwz r0, 44*4(r1) + lwz r5, 45*4(r1) + lwz r6, 46*4(r1) + lwz r7, 47*4(r1) + xor r20, r20, r0 + xor r21, r21, r5 + xor r22, r22, r6 + xor r23, r23, r7 + xor r24, r24, r8 + xor r25, r25, r9 + xor r26, r26, r10 + xor r27, r27, r11 + xor r28, r28, r12 + xor r29, r29, r13 + xor r30, r30, r14 + xor r31, r31, r15 + + lwz r0, 0*4(r3) + lwz r5, 1*4(r3) + lwz r6, 2*4(r3) + lwz r7, 3*4(r3) + xor r16, r16, r0 + xor r17, r17, r5 + xor r18, r18, r6 + xor r19, r19, r7 + lwz 
r0, 4*4(r3) + lwz r5, 5*4(r3) + lwz r6, 6*4(r3) + lwz r7, 7*4(r3) + xor r20, r20, r0 + xor r21, r21, r5 + xor r22, r22, r6 + xor r23, r23, r7 + lwz r0, 8*4(r3) + lwz r5, 9*4(r3) + lwz r6, 10*4(r3) + lwz r7, 11*4(r3) + xor r24, r24, r0 + xor r25, r25, r5 + xor r26, r26, r6 + xor r27, r27, r7 + lwz r0, 12*4(r3) + lwz r5, 13*4(r3) + lwz r6, 14*4(r3) + lwz r7, 15*4(r3) + xor r28, r28, r0 + xor r29, r29, r5 + xor r30, r30, r6 + xor r31, r31, r7 + + stw r16, 24*4(r1) + stw r17, 25*4(r1) + stw r18, 26*4(r1) + stw r19, 27*4(r1) + stw r20, 28*4(r1) + stw r21, 29*4(r1) + stw r22, 30*4(r1) + stw r23, 31*4(r1) + stw r24, 32*4(r1) + stw r25, 33*4(r1) + stw r26, 34*4(r1) + stw r27, 35*4(r1) + stw r28, 36*4(r1) + stw r29, 37*4(r1) + stw r30, 38*4(r1) + stw r31, 39*4(r1) + + add r0, r16, r28 + add r5, r21, r17 + add r6, r26, r22 + add r7, r31, r27 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r20, r20, r0 + xor r25, r25, r5 + xor r30, r30, r6 + xor r19, r19, r7 + + add r0, r20, r16 + add r5, r25, r21 + add r6, r30, r26 + add r7, r19, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r24, r24, r0 + xor r29, r29, r5 + xor r18, r18, r6 + xor r23, r23, r7 + + add r0, r24, r20 + add r5, r29, r25 + add r6, r18, r30 + add r7, r23, r19 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r28, r28, r0 + xor r17, r17, r5 + xor r22, r22, r6 + xor r27, r27, r7 + + add r0, r28, r24 + add r5, r17, r29 + add r6, r22, r18 + add r7, r27, r23 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + + add r0, r16, r19 + add r5, r21, r20 + add r6, r26, r25 + add r7, r31, r30 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r17, r17, r0 + xor r22, r22, r5 + xor r27, r27, r6 + xor r28, r28, r7 + + add r0, r17, r16 + add r5, r22, r21 + add r6, r27, r26 + add r7, r28, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r18, r18, r0 + xor r23, r23, r5 + xor r24, r24, r6 + xor r29, r29, r7 + + add r0, r18, r17 + add r5, r23, r22 + add r6, r24, r27 + add r7, r29, r28 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r19, r19, r0 + xor r20, r20, r5 + xor r25, r25, r6 + xor r30, r30, r7 + + add r0, r19, r18 + add r5, r20, r23 + add r6, r25, r24 + add r7, r30, r29 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + add r0, r16, r28 + add r5, r21, r17 + add r6, r26, r22 + add r7, r31, r27 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r20, r20, r0 + xor r25, r25, r5 + xor r30, r30, r6 + xor r19, r19, r7 + + add r0, r20, r16 + add r5, r25, r21 + add r6, r30, r26 + add r7, r19, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r24, r24, r0 + xor r29, r29, r5 + xor r18, r18, r6 + xor r23, r23, r7 + + add r0, r24, r20 + add r5, r29, r25 + add r6, r18, r30 + add r7, r23, r19 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r28, r28, r0 + xor r17, r17, r5 + xor r22, r22, r6 + xor r27, r27, r7 + + add r0, r28, r24 + add r5, r17, r29 + add r6, r22, r18 + add r7, r27, r23 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + + 
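[Reference note, not part of the patch] Each add/rotlwi/xor group in the unrolled PowerPC code above and below is one Salsa20 quarter-round step applied to four state words at once; the rotation counts 7, 9, 13 and 18 match the rotlwi immediates. A minimal C sketch of the double round being unrolled here (the function name is illustrative):

	#include <stdint.h>

	#define R(x, n) (((x) << (n)) | ((x) >> (32 - (n))))

	/* One Salsa20 double round (column round, then row round) over the
	 * 4x4 word state x[16]; Salsa20/8 applies four of these.
	 * In the asm, x[0..15] live in r16..r31, so e.g.
	 * "add r0, r16, r28; rotlwi r0, r0, 7; xor r20, r20, r0"
	 * is the first statement below: x[4] ^= R(x[0] + x[12], 7). */
	static void salsa20_doubleround(uint32_t x[16])
	{
		x[ 4] ^= R(x[ 0] + x[12],  7);  x[ 8] ^= R(x[ 4] + x[ 0],  9);
		x[12] ^= R(x[ 8] + x[ 4], 13);  x[ 0] ^= R(x[12] + x[ 8], 18);
		x[ 9] ^= R(x[ 5] + x[ 1],  7);  x[13] ^= R(x[ 9] + x[ 5],  9);
		x[ 1] ^= R(x[13] + x[ 9], 13);  x[ 5] ^= R(x[ 1] + x[13], 18);
		x[14] ^= R(x[10] + x[ 6],  7);  x[ 2] ^= R(x[14] + x[10],  9);
		x[ 6] ^= R(x[ 2] + x[14], 13);  x[10] ^= R(x[ 6] + x[ 2], 18);
		x[ 3] ^= R(x[15] + x[11],  7);  x[ 7] ^= R(x[ 3] + x[15],  9);
		x[11] ^= R(x[ 7] + x[ 3], 13);  x[15] ^= R(x[11] + x[ 7], 18);

		x[ 1] ^= R(x[ 0] + x[ 3],  7);  x[ 2] ^= R(x[ 1] + x[ 0],  9);
		x[ 3] ^= R(x[ 2] + x[ 1], 13);  x[ 0] ^= R(x[ 3] + x[ 2], 18);
		x[ 6] ^= R(x[ 5] + x[ 4],  7);  x[ 7] ^= R(x[ 6] + x[ 5],  9);
		x[ 4] ^= R(x[ 7] + x[ 6], 13);  x[ 5] ^= R(x[ 4] + x[ 7], 18);
		x[11] ^= R(x[10] + x[ 9],  7);  x[ 8] ^= R(x[11] + x[10],  9);
		x[ 9] ^= R(x[ 8] + x[11], 13);  x[10] ^= R(x[ 9] + x[ 8], 18);
		x[12] ^= R(x[15] + x[14],  7);  x[13] ^= R(x[12] + x[15],  9);
		x[14] ^= R(x[13] + x[12], 13);  x[15] ^= R(x[14] + x[13], 18);
	}

Salsa20/8, as used by scrypt, is four of these double rounds; the salsa8_core macro in scrypt-ppc.S.orig later in this patch expands salsa8_core_doubleround exactly four times, and the new code above simply unrolls that expansion inline.
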
add r0, r16, r19 + add r5, r21, r20 + add r6, r26, r25 + add r7, r31, r30 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r17, r17, r0 + xor r22, r22, r5 + xor r27, r27, r6 + xor r28, r28, r7 + + add r0, r17, r16 + add r5, r22, r21 + add r6, r27, r26 + add r7, r28, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r18, r18, r0 + xor r23, r23, r5 + xor r24, r24, r6 + xor r29, r29, r7 + + add r0, r18, r17 + add r5, r23, r22 + add r6, r24, r27 + add r7, r29, r28 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r19, r19, r0 + xor r20, r20, r5 + xor r25, r25, r6 + xor r30, r30, r7 + + add r0, r19, r18 + add r5, r20, r23 + add r6, r25, r24 + add r7, r30, r29 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + add r0, r16, r28 + add r5, r21, r17 + add r6, r26, r22 + add r7, r31, r27 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r20, r20, r0 + xor r25, r25, r5 + xor r30, r30, r6 + xor r19, r19, r7 + + add r0, r20, r16 + add r5, r25, r21 + add r6, r30, r26 + add r7, r19, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r24, r24, r0 + xor r29, r29, r5 + xor r18, r18, r6 + xor r23, r23, r7 + + add r0, r24, r20 + add r5, r29, r25 + add r6, r18, r30 + add r7, r23, r19 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r28, r28, r0 + xor r17, r17, r5 + xor r22, r22, r6 + xor r27, r27, r7 + + add r0, r28, r24 + add r5, r17, r29 + add r6, r22, r18 + add r7, r27, r23 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + + add r0, r16, r19 + add r5, r21, r20 + add r6, r26, r25 + add r7, r31, r30 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r17, r17, r0 + xor r22, r22, r5 + xor r27, r27, r6 + xor r28, r28, r7 + + add r0, r17, r16 + add r5, r22, r21 + add r6, r27, r26 + add r7, r28, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r18, r18, r0 + xor r23, r23, r5 + xor r24, r24, r6 + xor r29, r29, r7 + + add r0, r18, r17 + add r5, r23, r22 + add r6, r24, r27 + add r7, r29, r28 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r19, r19, r0 + xor r20, r20, r5 + xor r25, r25, r6 + xor r30, r30, r7 + + add r0, r19, r18 + add r5, r20, r23 + add r6, r25, r24 + add r7, r30, r29 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + add r0, r16, r28 + add r5, r21, r17 + add r6, r26, r22 + add r7, r31, r27 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r20, r20, r0 + xor r25, r25, r5 + xor r30, r30, r6 + xor r19, r19, r7 + + add r0, r20, r16 + add r5, r25, r21 + add r6, r30, r26 + add r7, r19, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r24, r24, r0 + xor r29, r29, r5 + xor r18, r18, r6 + xor r23, r23, r7 + + add r0, r24, r20 + add r5, r29, r25 + add r6, r18, r30 + add r7, r23, r19 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r28, r28, r0 + xor r17, r17, r5 + xor r22, r22, r6 + xor r27, r27, r7 + + add r0, r28, r24 + add r5, r17, r29 + add r6, r22, r18 + add r7, r27, r23 + 
rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + + add r0, r16, r19 + add r5, r21, r20 + add r6, r26, r25 + add r7, r31, r30 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r17, r17, r0 + xor r22, r22, r5 + xor r27, r27, r6 + xor r28, r28, r7 + + add r0, r17, r16 + add r5, r22, r21 + add r6, r27, r26 + add r7, r28, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r18, r18, r0 + xor r23, r23, r5 + xor r24, r24, r6 + xor r29, r29, r7 + + add r0, r18, r17 + add r5, r23, r22 + add r6, r24, r27 + add r7, r29, r28 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r19, r19, r0 + xor r20, r20, r5 + xor r25, r25, r6 + xor r30, r30, r7 + + add r0, r19, r18 + add r5, r20, r23 + add r6, r25, r24 + add r7, r30, r29 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + + lwz r0, 24*4(r1) + lwz r5, 25*4(r1) + lwz r6, 26*4(r1) + lwz r7, 27*4(r1) + add r16, r16, r0 + add r17, r17, r5 + add r18, r18, r6 + add r19, r19, r7 + lwz r0, 28*4(r1) + lwz r5, 29*4(r1) + lwz r6, 30*4(r1) + lwz r7, 31*4(r1) + add r20, r20, r0 + add r21, r21, r5 + add r22, r22, r6 + add r23, r23, r7 + lwz r0, 32*4(r1) + lwz r5, 33*4(r1) + lwz r6, 34*4(r1) + lwz r7, 35*4(r1) + add r24, r24, r0 + add r25, r25, r5 + add r26, r26, r6 + add r27, r27, r7 + lwz r0, 36*4(r1) + lwz r5, 37*4(r1) + lwz r6, 38*4(r1) + lwz r7, 39*4(r1) + add r28, r28, r0 + add r29, r29, r5 + add r30, r30, r6 + add r31, r31, r7 + + stw r16, 24*4(r1) + stw r17, 25*4(r1) + stw r18, 26*4(r1) + stw r19, 27*4(r1) + stw r20, 28*4(r1) + stw r21, 29*4(r1) + stw r22, 30*4(r1) + stw r23, 31*4(r1) + stw r24, 32*4(r1) + stw r25, 33*4(r1) + stw r26, 34*4(r1) stw r27, 35*4(r1) stw r28, 36*4(r1) stw r29, 37*4(r1) stw r30, 38*4(r1) stw r31, 39*4(r1) - lwz r16, 16*4(r3) - lwz r17, 17*4(r3) - lwz r18, 18*4(r3) - lwz r19, 19*4(r3) - lwz r20, 20*4(r3) - lwz r21, 21*4(r3) - lwz r22, 22*4(r3) - lwz r23, 23*4(r3) - stw r16, 40*4(r1) - stw r17, 41*4(r1) - stw r18, 42*4(r1) - stw r19, 43*4(r1) - stw r20, 44*4(r1) - stw r21, 45*4(r1) - stw r22, 46*4(r1) - stw r23, 47*4(r1) - lwz r8, 24*4(r3) - lwz r9, 25*4(r3) - lwz r10, 26*4(r3) - lwz r11, 27*4(r3) - lwz r12, 28*4(r3) - lwz r13, 29*4(r3) - lwz r14, 30*4(r3) - lwz r15, 31*4(r3) - mtctr r5 -scrypt_core_loop1: - lwz r16, 24*4(r1) - lwz r17, 25*4(r1) - lwz r18, 26*4(r1) - lwz r19, 27*4(r1) - lwz r20, 28*4(r1) - lwz r21, 29*4(r1) - lwz r22, 30*4(r1) - lwz r23, 31*4(r1) - lwz r24, 32*4(r1) - lwz r25, 33*4(r1) - lwz r26, 34*4(r1) - lwz r27, 35*4(r1) - lwz r28, 36*4(r1) - lwz r29, 37*4(r1) - lwz r30, 38*4(r1) - lwz r31, 39*4(r1) + lwz r0, 16*4(r3) + lwz r5, 17*4(r3) + lwz r6, 18*4(r3) + lwz r7, 19*4(r3) + xor r16, r16, r0 + xor r17, r17, r5 + xor r18, r18, r6 + xor r19, r19, r7 + lwz r0, 20*4(r3) + lwz r5, 21*4(r3) + lwz r6, 22*4(r3) + lwz r7, 23*4(r3) + xor r20, r20, r0 + xor r21, r21, r5 + xor r22, r22, r6 + xor r23, r23, r7 + lwz r0, 24*4(r3) + lwz r5, 25*4(r3) + lwz r6, 26*4(r3) + lwz r7, 27*4(r3) + xor r24, r24, r0 + xor r25, r25, r5 + xor r26, r26, r6 + xor r27, r27, r7 + lwz r0, 28*4(r3) + lwz r5, 29*4(r3) + lwz r6, 30*4(r3) + lwz r7, 31*4(r3) + xor r28, r28, r0 + xor r29, r29, r5 + xor r30, r30, r6 + xor r31, r31, r7 lwz r0, 40*4(r1) lwz r5, 41*4(r1) @@ -616,15 +2657,6 @@ scrypt_core_loop1: xor r17, r17, r5 xor r18, r18, r6 xor 
r19, r19, r7 - stw r16, 0*4(r4) - stw r17, 1*4(r4) - stw r18, 2*4(r4) - stw r19, 3*4(r4) - stw r0, 16*4(r4) - stw r5, 17*4(r4) - stw r6, 18*4(r4) - stw r7, 19*4(r4) - lwz r0, 44*4(r1) lwz r5, 45*4(r1) lwz r6, 46*4(r1) @@ -633,15 +2665,6 @@ scrypt_core_loop1: xor r21, r21, r5 xor r22, r22, r6 xor r23, r23, r7 - stw r0, 20*4(r4) - stw r5, 21*4(r4) - stw r6, 22*4(r4) - stw r7, 23*4(r4) - stw r20, 4*4(r4) - stw r21, 5*4(r4) - stw r22, 6*4(r4) - stw r23, 7*4(r4) - xor r24, r24, r8 xor r25, r25, r9 xor r26, r26, r10 @@ -650,383 +2673,436 @@ scrypt_core_loop1: xor r29, r29, r13 xor r30, r30, r14 xor r31, r31, r15 - stw r24, 8*4(r4) - stw r25, 9*4(r4) - stw r26, 10*4(r4) - stw r27, 11*4(r4) - stw r28, 12*4(r4) - stw r29, 13*4(r4) - stw r30, 14*4(r4) - stw r31, 15*4(r4) - stw r8, 24*4(r4) - stw r9, 25*4(r4) - stw r10, 26*4(r4) - stw r11, 27*4(r4) - stw r12, 28*4(r4) - stw r13, 29*4(r4) - stw r14, 30*4(r4) - stw r15, 31*4(r4) + stw r16, 40*4(r1) + stw r17, 41*4(r1) + stw r18, 42*4(r1) + stw r19, 43*4(r1) + mr r8, r24 + mr r9, r25 + mr r10, r26 + mr r11, r27 + stw r20, 44*4(r1) + stw r21, 45*4(r1) + stw r22, 46*4(r1) + stw r23, 47*4(r1) + mr r12, r28 + mr r13, r29 + mr r14, r30 + mr r15, r31 + + add r0, r16, r28 + add r5, r21, r17 + add r6, r26, r22 + add r7, r31, r27 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r20, r20, r0 + xor r25, r25, r5 + xor r30, r30, r6 + xor r19, r19, r7 - salsa8_core + add r0, r20, r16 + add r5, r25, r21 + add r6, r30, r26 + add r7, r19, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r24, r24, r0 + xor r29, r29, r5 + xor r18, r18, r6 + xor r23, r23, r7 - lwz r0, 0*4(r4) - lwz r5, 1*4(r4) - lwz r6, 2*4(r4) - lwz r7, 3*4(r4) - add r16, r16, r0 - add r17, r17, r5 - add r18, r18, r6 - add r19, r19, r7 - lwz r0, 4*4(r4) - lwz r5, 5*4(r4) - lwz r6, 6*4(r4) - lwz r7, 7*4(r4) - add r20, r20, r0 - add r21, r21, r5 - add r22, r22, r6 - add r23, r23, r7 - lwz r0, 8*4(r4) - lwz r5, 9*4(r4) - lwz r6, 10*4(r4) - lwz r7, 11*4(r4) - add r24, r24, r0 - add r25, r25, r5 - add r26, r26, r6 - add r27, r27, r7 - lwz r0, 12*4(r4) - lwz r5, 13*4(r4) - lwz r6, 14*4(r4) - lwz r7, 15*4(r4) - add r28, r28, r0 - add r29, r29, r5 - add r30, r30, r6 - add r31, r31, r7 + add r0, r24, r20 + add r5, r29, r25 + add r6, r18, r30 + add r7, r23, r19 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r28, r28, r0 + xor r17, r17, r5 + xor r22, r22, r6 + xor r27, r27, r7 + + add r0, r28, r24 + add r5, r17, r29 + add r6, r22, r18 + add r7, r27, r23 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + + add r0, r16, r19 + add r5, r21, r20 + add r6, r26, r25 + add r7, r31, r30 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r17, r17, r0 + xor r22, r22, r5 + xor r27, r27, r6 + xor r28, r28, r7 + + add r0, r17, r16 + add r5, r22, r21 + add r6, r27, r26 + add r7, r28, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r18, r18, r0 + xor r23, r23, r5 + xor r24, r24, r6 + xor r29, r29, r7 + + add r0, r18, r17 + add r5, r23, r22 + add r6, r24, r27 + add r7, r29, r28 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r19, r19, r0 + xor r20, r20, r5 + xor r25, r25, r6 + xor r30, r30, r7 + + add r0, r19, r18 + add r5, r20, r23 + add r6, r25, r24 + add r7, r30, r29 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + 
rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + add r0, r16, r28 + add r5, r21, r17 + add r6, r26, r22 + add r7, r31, r27 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r20, r20, r0 + xor r25, r25, r5 + xor r30, r30, r6 + xor r19, r19, r7 + + add r0, r20, r16 + add r5, r25, r21 + add r6, r30, r26 + add r7, r19, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r24, r24, r0 + xor r29, r29, r5 + xor r18, r18, r6 + xor r23, r23, r7 + + add r0, r24, r20 + add r5, r29, r25 + add r6, r18, r30 + add r7, r23, r19 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r28, r28, r0 + xor r17, r17, r5 + xor r22, r22, r6 + xor r27, r27, r7 + + add r0, r28, r24 + add r5, r17, r29 + add r6, r22, r18 + add r7, r27, r23 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + + add r0, r16, r19 + add r5, r21, r20 + add r6, r26, r25 + add r7, r31, r30 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r17, r17, r0 + xor r22, r22, r5 + xor r27, r27, r6 + xor r28, r28, r7 + + add r0, r17, r16 + add r5, r22, r21 + add r6, r27, r26 + add r7, r28, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r18, r18, r0 + xor r23, r23, r5 + xor r24, r24, r6 + xor r29, r29, r7 - stw r16, 24*4(r1) - stw r17, 25*4(r1) - stw r18, 26*4(r1) - stw r19, 27*4(r1) - stw r20, 28*4(r1) - stw r21, 29*4(r1) - stw r22, 30*4(r1) - stw r23, 31*4(r1) - stw r24, 32*4(r1) - stw r25, 33*4(r1) - stw r26, 34*4(r1) - stw r27, 35*4(r1) - stw r28, 36*4(r1) - stw r29, 37*4(r1) - stw r30, 38*4(r1) - stw r31, 39*4(r1) + add r0, r18, r17 + add r5, r23, r22 + add r6, r24, r27 + add r7, r29, r28 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r19, r19, r0 + xor r20, r20, r5 + xor r25, r25, r6 + xor r30, r30, r7 - lwz r0, 40*4(r1) - lwz r5, 41*4(r1) - lwz r6, 42*4(r1) - lwz r7, 43*4(r1) + add r0, r19, r18 + add r5, r20, r23 + add r6, r25, r24 + add r7, r30, r29 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 xor r16, r16, r0 - xor r17, r17, r5 - xor r18, r18, r6 - xor r19, r19, r7 - lwz r0, 44*4(r1) - lwz r5, 45*4(r1) - lwz r6, 46*4(r1) - lwz r7, 47*4(r1) - xor r20, r20, r0 xor r21, r21, r5 - xor r22, r22, r6 + xor r26, r26, r6 + xor r31, r31, r7 + add r0, r16, r28 + add r5, r21, r17 + add r6, r26, r22 + add r7, r31, r27 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r20, r20, r0 + xor r25, r25, r5 + xor r30, r30, r6 + xor r19, r19, r7 + + add r0, r20, r16 + add r5, r25, r21 + add r6, r30, r26 + add r7, r19, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r24, r24, r0 + xor r29, r29, r5 + xor r18, r18, r6 xor r23, r23, r7 - xor r24, r24, r8 - xor r25, r25, r9 - xor r26, r26, r10 - xor r27, r27, r11 - xor r28, r28, r12 - xor r29, r29, r13 - xor r30, r30, r14 - xor r31, r31, r15 - stw r16, 40*4(r1) - stw r17, 41*4(r1) - stw r18, 42*4(r1) - stw r19, 43*4(r1) - mr r8, r24 - mr r9, r25 - mr r10, r26 - mr r11, r27 - stw r20, 44*4(r1) - stw r21, 45*4(r1) - stw r22, 46*4(r1) - stw r23, 47*4(r1) - mr r12, r28 - mr r13, r29 - mr r14, r30 - mr r15, r31 - salsa8_core + add r0, r24, r20 + add r5, r29, r25 + add r6, r18, r30 + add r7, r23, r19 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, 
r6, 13 + rotlwi r7, r7, 13 + xor r28, r28, r0 + xor r17, r17, r5 + xor r22, r22, r6 + xor r27, r27, r7 - lwz r0, 40*4(r1) - lwz r5, 41*4(r1) - lwz r6, 42*4(r1) - lwz r7, 43*4(r1) - add r16, r16, r0 - add r17, r17, r5 - add r18, r18, r6 - add r19, r19, r7 - lwz r0, 44*4(r1) - lwz r5, 45*4(r1) - lwz r6, 46*4(r1) - lwz r7, 47*4(r1) - add r20, r20, r0 - add r21, r21, r5 - add r22, r22, r6 - add r23, r23, r7 - add r8, r8, r24 - add r9, r9, r25 - add r10, r10, r26 - add r11, r11, r27 - stw r16, 40*4(r1) - stw r17, 41*4(r1) - stw r18, 42*4(r1) - stw r19, 43*4(r1) - add r12, r12, r28 - add r13, r13, r29 - add r14, r14, r30 - add r15, r15, r31 - stw r20, 44*4(r1) - stw r21, 45*4(r1) - stw r22, 46*4(r1) - stw r23, 47*4(r1) + add r0, r28, r24 + add r5, r17, r29 + add r6, r22, r18 + add r7, r27, r23 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 - addi r4, r4, 32*4 - bdnz scrypt_core_loop1 + add r0, r16, r19 + add r5, r21, r20 + add r6, r26, r25 + add r7, r31, r30 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r17, r17, r0 + xor r22, r22, r5 + xor r27, r27, r6 + xor r28, r28, r7 - lwz r5, 2*4(r1) - slwi r3, r5, 7 - subf r4, r3, r4 - mtctr r5 - addi r5, r5, -1 - stw r5, 2*4(r1) -scrypt_core_loop2: - and r3, r16, r5 - slwi r3, r3, 7 - add r3, r3, r4 - mr r0, r16 - mr r5, r17 - mr r6, r18 - mr r7, r19 - lwz r16, 24*4(r1) - lwz r17, 25*4(r1) - lwz r18, 26*4(r1) - lwz r19, 27*4(r1) - lwz r20, 28*4(r1) - lwz r21, 29*4(r1) - lwz r22, 30*4(r1) - lwz r23, 31*4(r1) - lwz r24, 32*4(r1) - lwz r25, 33*4(r1) - lwz r26, 34*4(r1) - lwz r27, 35*4(r1) - lwz r28, 36*4(r1) - lwz r29, 37*4(r1) - lwz r30, 38*4(r1) - lwz r31, 39*4(r1) + add r0, r17, r16 + add r5, r22, r21 + add r6, r27, r26 + add r7, r28, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r18, r18, r0 + xor r23, r23, r5 + xor r24, r24, r6 + xor r29, r29, r7 + + add r0, r18, r17 + add r5, r23, r22 + add r6, r24, r27 + add r7, r29, r28 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r19, r19, r0 + xor r20, r20, r5 + xor r25, r25, r6 + xor r30, r30, r7 + + add r0, r19, r18 + add r5, r20, r23 + add r6, r25, r24 + add r7, r30, r29 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 xor r16, r16, r0 - xor r17, r17, r5 - xor r18, r18, r6 - xor r19, r19, r7 - lwz r0, 44*4(r1) - lwz r5, 45*4(r1) - lwz r6, 46*4(r1) - lwz r7, 47*4(r1) + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + add r0, r16, r28 + add r5, r21, r17 + add r6, r26, r22 + add r7, r31, r27 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 xor r20, r20, r0 - xor r21, r21, r5 - xor r22, r22, r6 + xor r25, r25, r5 + xor r30, r30, r6 + xor r19, r19, r7 + + add r0, r20, r16 + add r5, r25, r21 + add r6, r30, r26 + add r7, r19, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r24, r24, r0 + xor r29, r29, r5 + xor r18, r18, r6 xor r23, r23, r7 - xor r24, r24, r8 - xor r25, r25, r9 - xor r26, r26, r10 - xor r27, r27, r11 - xor r28, r28, r12 - xor r29, r29, r13 - xor r30, r30, r14 - xor r31, r31, r15 - lwz r0, 0*4(r3) - lwz r5, 1*4(r3) - lwz r6, 2*4(r3) - lwz r7, 3*4(r3) - xor r16, r16, r0 + add r0, r24, r20 + add r5, r29, r25 + add r6, r18, r30 + add r7, r23, r19 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r28, r28, r0 xor r17, r17, r5 - xor r18, r18, r6 - 
xor r19, r19, r7 - lwz r0, 4*4(r3) - lwz r5, 5*4(r3) - lwz r6, 6*4(r3) - lwz r7, 7*4(r3) - xor r20, r20, r0 - xor r21, r21, r5 xor r22, r22, r6 - xor r23, r23, r7 - lwz r0, 8*4(r3) - lwz r5, 9*4(r3) - lwz r6, 10*4(r3) - lwz r7, 11*4(r3) - xor r24, r24, r0 - xor r25, r25, r5 - xor r26, r26, r6 xor r27, r27, r7 - lwz r0, 12*4(r3) - lwz r5, 13*4(r3) - lwz r6, 14*4(r3) - lwz r7, 15*4(r3) - xor r28, r28, r0 - xor r29, r29, r5 - xor r30, r30, r6 - xor r31, r31, r7 - stw r16, 24*4(r1) - stw r17, 25*4(r1) - stw r18, 26*4(r1) - stw r19, 27*4(r1) - stw r20, 28*4(r1) - stw r21, 29*4(r1) - stw r22, 30*4(r1) - stw r23, 31*4(r1) - stw r24, 32*4(r1) - stw r25, 33*4(r1) - stw r26, 34*4(r1) - stw r27, 35*4(r1) - stw r28, 36*4(r1) - stw r29, 37*4(r1) - stw r30, 38*4(r1) - stw r31, 39*4(r1) + add r0, r28, r24 + add r5, r17, r29 + add r6, r22, r18 + add r7, r27, r23 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 - salsa8_core + add r0, r16, r19 + add r5, r21, r20 + add r6, r26, r25 + add r7, r31, r30 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r17, r17, r0 + xor r22, r22, r5 + xor r27, r27, r6 + xor r28, r28, r7 - lwz r0, 24*4(r1) - lwz r5, 25*4(r1) - lwz r6, 26*4(r1) - lwz r7, 27*4(r1) - add r16, r16, r0 - add r17, r17, r5 - add r18, r18, r6 - add r19, r19, r7 - lwz r0, 28*4(r1) - lwz r5, 29*4(r1) - lwz r6, 30*4(r1) - lwz r7, 31*4(r1) - add r20, r20, r0 - add r21, r21, r5 - add r22, r22, r6 - add r23, r23, r7 - lwz r0, 32*4(r1) - lwz r5, 33*4(r1) - lwz r6, 34*4(r1) - lwz r7, 35*4(r1) - add r24, r24, r0 - add r25, r25, r5 - add r26, r26, r6 - add r27, r27, r7 - lwz r0, 36*4(r1) - lwz r5, 37*4(r1) - lwz r6, 38*4(r1) - lwz r7, 39*4(r1) - add r28, r28, r0 - add r29, r29, r5 - add r30, r30, r6 - add r31, r31, r7 + add r0, r17, r16 + add r5, r22, r21 + add r6, r27, r26 + add r7, r28, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r18, r18, r0 + xor r23, r23, r5 + xor r24, r24, r6 + xor r29, r29, r7 - stw r16, 24*4(r1) - stw r17, 25*4(r1) - stw r18, 26*4(r1) - stw r19, 27*4(r1) - stw r20, 28*4(r1) - stw r21, 29*4(r1) - stw r22, 30*4(r1) - stw r23, 31*4(r1) - stw r24, 32*4(r1) - stw r25, 33*4(r1) - stw r26, 34*4(r1) - stw r27, 35*4(r1) - stw r28, 36*4(r1) - stw r29, 37*4(r1) - stw r30, 38*4(r1) - stw r31, 39*4(r1) + add r0, r18, r17 + add r5, r23, r22 + add r6, r24, r27 + add r7, r29, r28 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r19, r19, r0 + xor r20, r20, r5 + xor r25, r25, r6 + xor r30, r30, r7 - lwz r0, 16*4(r3) - lwz r5, 17*4(r3) - lwz r6, 18*4(r3) - lwz r7, 19*4(r3) + add r0, r19, r18 + add r5, r20, r23 + add r6, r25, r24 + add r7, r30, r29 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 xor r16, r16, r0 - xor r17, r17, r5 - xor r18, r18, r6 - xor r19, r19, r7 - lwz r0, 20*4(r3) - lwz r5, 21*4(r3) - lwz r6, 22*4(r3) - lwz r7, 23*4(r3) - xor r20, r20, r0 xor r21, r21, r5 - xor r22, r22, r6 - xor r23, r23, r7 - lwz r0, 24*4(r3) - lwz r5, 25*4(r3) - lwz r6, 26*4(r3) - lwz r7, 27*4(r3) - xor r24, r24, r0 - xor r25, r25, r5 xor r26, r26, r6 - xor r27, r27, r7 - lwz r0, 28*4(r3) - lwz r5, 29*4(r3) - lwz r6, 30*4(r3) - lwz r7, 31*4(r3) - xor r28, r28, r0 - xor r29, r29, r5 - xor r30, r30, r6 xor r31, r31, r7 - lwz r0, 40*4(r1) - lwz r5, 41*4(r1) - lwz r6, 42*4(r1) - lwz r7, 43*4(r1) - xor r16, r16, r0 - xor r17, r17, r5 - xor r18, r18, r6 - xor r19, r19, r7 - 
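[Reference note, not part of the patch] Stepping back from the instruction level, scrypt_core_loop1 and scrypt_core_loop2 follow the usual scrypt ROMix shape: the first loop fills the scratchpad, the second reads it back at data-dependent indices (the masked "and r3, r16, r5" index above). A C sketch of that structure, building on the double-round sketch above (helper names are illustrative, not the patch's own):

	#include <stdint.h>
	#include <string.h>

	/* C equivalent of the inlined "xor the other half in, run salsa8_core,
	 * add the pre-round value back" step, using salsa20_doubleround() from
	 * the sketch above. */
	static void xor_salsa8(uint32_t B[16], const uint32_t Bx[16])
	{
		uint32_t x[16];
		int i;

		for (i = 0; i < 16; i++)
			x[i] = (B[i] ^= Bx[i]);
		for (i = 0; i < 4; i++)		/* Salsa20/8 = 4 double rounds */
			salsa20_doubleround(x);
		for (i = 0; i < 16; i++)
			B[i] += x[i];
	}

	/* Shape of scrypt_core: X is the 32-word (128-byte) working block,
	 * V the N*128-byte scratchpad (the r4 pointer here; %rsi in the
	 * x86-64 code further down). */
	static void scrypt_core_sketch(uint32_t X[32], uint32_t *V, uint32_t N)
	{
		uint32_t i, j, k;

		for (i = 0; i < N; i++) {		/* scrypt_core_loop1 */
			memcpy(&V[i * 32], X, 128);	/* V[i] = X */
			xor_salsa8(&X[0], &X[16]);
			xor_salsa8(&X[16], &X[0]);
		}
		for (i = 0; i < N; i++) {		/* scrypt_core_loop2 */
			j = X[16] & (N - 1);		/* "and r3, r16, r5" above */
			for (k = 0; k < 32; k++)
				X[k] ^= V[j * 32 + k];	/* X ^= V[j] */
			xor_salsa8(&X[0], &X[16]);
			xor_salsa8(&X[16], &X[0]);
		}
	}
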
lwz r0, 44*4(r1) - lwz r5, 45*4(r1) - lwz r6, 46*4(r1) - lwz r7, 47*4(r1) - xor r20, r20, r0 - xor r21, r21, r5 - xor r22, r22, r6 - xor r23, r23, r7 - xor r24, r24, r8 - xor r25, r25, r9 - xor r26, r26, r10 - xor r27, r27, r11 - xor r28, r28, r12 - xor r29, r29, r13 - xor r30, r30, r14 - xor r31, r31, r15 - stw r16, 40*4(r1) - stw r17, 41*4(r1) - stw r18, 42*4(r1) - stw r19, 43*4(r1) - mr r8, r24 - mr r9, r25 - mr r10, r26 - mr r11, r27 - stw r20, 44*4(r1) - stw r21, 45*4(r1) - stw r22, 46*4(r1) - stw r23, 47*4(r1) - mr r12, r28 - mr r13, r29 - mr r14, r30 - mr r15, r31 - - salsa8_core - lwz r0, 40*4(r1) lwz r5, 41*4(r1) lwz r6, 42*4(r1) diff --git a/scrypt-ppc.S.orig b/scrypt-ppc.S.orig new file mode 100644 index 000000000..47ef643ec --- /dev/null +++ b/scrypt-ppc.S.orig @@ -0,0 +1,1148 @@ +/* + * Copyright 2014-2015 pooler@litecoinpool.org + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. See COPYING for more details. + */ + +#include "cpuminer-config.h" + +#if defined(USE_ASM) && (defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)) + +#ifndef __APPLE__ + +#define r0 0 +#define r1 1 +#define r2 2 +#define r3 3 +#define r4 4 +#define r5 5 +#define r6 6 +#define r7 7 +#define r8 8 +#define r9 9 +#define r10 10 +#define r11 11 +#define r12 12 +#define r13 13 +#define r14 14 +#define r15 15 +#define r16 16 +#define r17 17 +#define r18 18 +#define r19 19 +#define r20 20 +#define r21 21 +#define r22 22 +#define r23 23 +#define r24 24 +#define r25 25 +#define r26 26 +#define r27 27 +#define r28 28 +#define r29 29 +#define r30 30 +#define r31 31 + +#ifdef __ALTIVEC__ +#define v0 0 +#define v1 1 +#define v2 2 +#define v3 3 +#define v4 4 +#define v5 5 +#define v6 6 +#define v7 7 +#define v8 8 +#define v9 9 +#define v10 10 +#define v11 11 +#define v12 12 +#define v13 13 +#define v14 14 +#define v15 15 +#define v16 16 +#define v17 17 +#define v18 18 +#define v19 19 +#define v20 20 +#define v21 21 +#define v22 22 +#define v23 23 +#define v24 24 +#define v25 25 +#define v26 26 +#define v27 27 +#define v28 28 +#define v29 29 +#define v30 30 +#define v31 31 +#endif + +#endif + +#if !(defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) || \ + defined(__64BIT__) || defined(_LP64) || defined(__LP64__)) +#define ld lwz +#define std stw +#define stdu stwu +#define stdux stwux +#endif + + +#ifdef __ALTIVEC__ + +#ifdef __APPLE__ + .machine ppc7400 +#endif + +.macro salsa8_core_doubleround + vadduwm v4, v0, v1 + vrlw v4, v4, v16 + vxor v3, v3, v4 + + vadduwm v4, v3, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, v2, v3 + vrlw v4, v4, v18 + vsldoi v3, v3, v3, 12 + vxor v1, v1, v4 + + vadduwm v4, v1, v2 + vrlw v4, v4, v19 + vsldoi v1, v1, v1, 4 + vxor v0, v0, v4 + + vadduwm v4, v0, v3 + vrlw v4, v4, v16 + vsldoi v2, v2, v2, 8 + vxor v1, v1, v4 + + vadduwm v4, v1, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, v2, v1 + vrlw v4, v4, v18 + vsldoi v1, v1, v1, 12 + vxor v3, v3, v4 + + vadduwm v4, v3, v2 + vrlw v4, v4, v19 + vsldoi v3, v3, v3, 4 + vxor v0, v0, v4 + vsldoi v2, v2, v2, 8 +.endm + +.macro salsa8_core + salsa8_core_doubleround + salsa8_core_doubleround + salsa8_core_doubleround + salsa8_core_doubleround +.endm + +#ifdef _AIX + .csect .text[PR] +#else + .text +#endif + .align 2 + .globl scrypt_core + .globl _scrypt_core + .globl .scrypt_core +#ifdef __ELF__ 
+ .type scrypt_core, %function +#endif +scrypt_core: +_scrypt_core: +.scrypt_core: + stdu r1, -4*4(r1) + mfspr r0, 256 + std r0, 2*4(r1) + oris r0, r0, 0xffff + ori r0, r0, 0xf000 + mtspr 256, r0 + + li r6, 1*16 + li r7, 2*16 + li r8, 3*16 + li r9, 4*16 + li r10, 5*16 + li r11, 6*16 + li r12, 7*16 + + lvx v8, 0, r3 + lvx v9, r3, r6 + lvx v10, r3, r7 + lvx v11, r3, r8 + lvx v12, r3, r9 + lvx v13, r3, r10 + lvx v14, r3, r11 + lvx v15, r3, r12 + + vxor v0, v0, v0 + vnor v1, v0, v0 + vsldoi v2, v0, v1, 4 + vsldoi v3, v2, v0, 8 + vor v3, v3, v2 + vsldoi v1, v0, v1, 8 + + vor v4, v8, v8 + vsel v8, v8, v9, v3 + vsel v9, v9, v10, v3 + vsel v10, v10, v11, v3 + vsel v11, v11, v4, v3 + vor v4, v8, v8 + vor v5, v9, v9 + vsel v8, v8, v10, v1 + vsel v9, v11, v9, v1 + vsel v10, v10, v4, v1 + vsel v11, v5, v11, v1 + + vor v4, v12, v12 + vsel v12, v12, v13, v3 + vsel v13, v13, v14, v3 + vsel v14, v14, v15, v3 + vsel v15, v15, v4, v3 + vor v4, v12, v12 + vor v5, v13, v13 + vsel v12, v12, v14, v1 + vsel v13, v15, v13, v1 + vsel v14, v14, v4, v1 + vsel v15, v5, v15, v1 + + vspltisw v16, 7 + vspltisw v17, 9 + vspltisw v18, 13 + vadduwm v19, v17, v17 + + mtctr r5 +scrypt_core_loop1: + vxor v8, v8, v12 + stvx v8, 0, r4 + vxor v9, v9, v13 + stvx v9, r4, r6 + vxor v10, v10, v14 + stvx v10, r4, r7 + vxor v11, v11, v15 + stvx v11, r4, r8 + vor v0, v8, v8 + stvx v12, r4, r9 + vor v1, v9, v9 + stvx v13, r4, r10 + vor v2, v10, v10 + stvx v14, r4, r11 + vor v3, v11, v11 + stvx v15, r4, r12 + + salsa8_core + + vadduwm v8, v8, v0 + vadduwm v9, v9, v1 + vadduwm v10, v10, v2 + vadduwm v11, v11, v3 + + vxor v12, v12, v8 + vxor v13, v13, v9 + vxor v14, v14, v10 + vxor v15, v15, v11 + vor v0, v12, v12 + vor v1, v13, v13 + vor v2, v14, v14 + vor v3, v15, v15 + + salsa8_core + + vadduwm v12, v12, v0 + vadduwm v13, v13, v1 + vadduwm v14, v14, v2 + vadduwm v15, v15, v3 + + addi r4, r4, 32*4 + bdnz scrypt_core_loop1 + + stvx v12, 0, r3 + slwi r6, r5, 7 + subf r4, r6, r4 + mtctr r5 + addi r5, r5, -1 + addi r7, r4, 1*16 + addi r8, r4, 2*16 + addi r9, r4, 3*16 +scrypt_core_loop2: + lwz r6, 0(r3) + and r6, r6, r5 + slwi r6, r6, 7 + lvx v0, r4, r6 + vxor v8, v8, v12 + lvx v1, r7, r6 + vxor v9, v9, v13 + lvx v2, r8, r6 + vxor v10, v10, v14 + lvx v3, r9, r6 + vxor v11, v11, v15 + vxor v0, v0, v8 + vxor v1, v1, v9 + vxor v2, v2, v10 + vxor v3, v3, v11 + addi r6, r6, 64 + vor v8, v0, v0 + vor v9, v1, v1 + lvx v5, r4, r6 + vor v10, v2, v2 + lvx v6, r7, r6 + vor v11, v3, v3 + lvx v7, r8, r6 + + salsa8_core + + vadduwm v8, v8, v0 + lvx v0, r9, r6 + vadduwm v9, v9, v1 + vadduwm v10, v10, v2 + vadduwm v11, v11, v3 + + vxor v12, v12, v5 + vxor v13, v13, v6 + vxor v14, v14, v7 + vxor v15, v15, v0 + vxor v12, v12, v8 + vxor v13, v13, v9 + vxor v14, v14, v10 + vxor v15, v15, v11 + vor v0, v12, v12 + vor v1, v13, v13 + vor v2, v14, v14 + vor v3, v15, v15 + + salsa8_core + + vadduwm v12, v12, v0 + stvx v12, 0, r3 + vadduwm v13, v13, v1 + vadduwm v14, v14, v2 + vadduwm v15, v15, v3 + + bdnz scrypt_core_loop2 + + vxor v0, v0, v0 + vnor v1, v0, v0 + vsldoi v2, v0, v1, 4 + vsldoi v3, v2, v0, 8 + vor v3, v3, v2 + vsldoi v1, v0, v1, 8 + + vor v4, v8, v8 + vsel v8, v8, v9, v3 + vsel v9, v9, v10, v3 + vsel v10, v10, v11, v3 + vsel v11, v11, v4, v3 + vor v4, v8, v8 + vor v5, v9, v9 + vsel v8, v8, v10, v1 + vsel v9, v11, v9, v1 + vsel v10, v10, v4, v1 + vsel v11, v5, v11, v1 + + vor v4, v12, v12 + vsel v12, v12, v13, v3 + vsel v13, v13, v14, v3 + vsel v14, v14, v15, v3 + vsel v15, v15, v4, v3 + vor v4, v12, v12 + vor v5, v13, v13 + vsel v12, v12, v14, v1 + vsel 
v13, v15, v13, v1 + vsel v14, v14, v4, v1 + vsel v15, v5, v15, v1 + + li r6, 1*16 + li r7, 2*16 + li r8, 3*16 + li r9, 4*16 + + stvx v8, 0, r3 + stvx v9, r3, r6 + stvx v10, r3, r7 + stvx v11, r3, r8 + stvx v12, r3, r9 + stvx v13, r3, r10 + stvx v14, r3, r11 + stvx v15, r3, r12 + + ld r0, 2*4(r1) + mtspr 256, r0 + addi r1, r1, 4*4 + blr + +#else /* __ALTIVEC__ */ + +.macro salsa8_core_doubleround + add r0, r16, r28 + add r5, r21, r17 + add r6, r26, r22 + add r7, r31, r27 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r20, r20, r0 + xor r25, r25, r5 + xor r30, r30, r6 + xor r19, r19, r7 + + add r0, r20, r16 + add r5, r25, r21 + add r6, r30, r26 + add r7, r19, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r24, r24, r0 + xor r29, r29, r5 + xor r18, r18, r6 + xor r23, r23, r7 + + add r0, r24, r20 + add r5, r29, r25 + add r6, r18, r30 + add r7, r23, r19 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r28, r28, r0 + xor r17, r17, r5 + xor r22, r22, r6 + xor r27, r27, r7 + + add r0, r28, r24 + add r5, r17, r29 + add r6, r22, r18 + add r7, r27, r23 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + + add r0, r16, r19 + add r5, r21, r20 + add r6, r26, r25 + add r7, r31, r30 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r17, r17, r0 + xor r22, r22, r5 + xor r27, r27, r6 + xor r28, r28, r7 + + add r0, r17, r16 + add r5, r22, r21 + add r6, r27, r26 + add r7, r28, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r18, r18, r0 + xor r23, r23, r5 + xor r24, r24, r6 + xor r29, r29, r7 + + add r0, r18, r17 + add r5, r23, r22 + add r6, r24, r27 + add r7, r29, r28 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r19, r19, r0 + xor r20, r20, r5 + xor r25, r25, r6 + xor r30, r30, r7 + + add r0, r19, r18 + add r5, r20, r23 + add r6, r25, r24 + add r7, r30, r29 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 +.endm + +.macro salsa8_core + salsa8_core_doubleround + salsa8_core_doubleround + salsa8_core_doubleround + salsa8_core_doubleround +.endm + +#ifdef _AIX + .csect .text[PR] +#else + .text +#endif + .align 2 + .globl scrypt_core + .globl _scrypt_core + .globl .scrypt_core +#ifdef __ELF__ + .type scrypt_core, %function +#endif +scrypt_core: +_scrypt_core: +.scrypt_core: + stdu r1, -68*4(r1) + stw r5, 2*4(r1) + std r13, 4*4(r1) + std r14, 6*4(r1) + std r15, 8*4(r1) + std r16, 10*4(r1) + std r17, 12*4(r1) + std r18, 14*4(r1) + std r19, 16*4(r1) + std r20, 18*4(r1) + std r21, 20*4(r1) + std r3, 22*4(r1) + std r22, 48*4(r1) + std r23, 50*4(r1) + std r24, 52*4(r1) + std r25, 54*4(r1) + std r26, 56*4(r1) + std r27, 58*4(r1) + std r28, 60*4(r1) + std r29, 62*4(r1) + std r30, 64*4(r1) + std r31, 66*4(r1) + + lwz r16, 0*4(r3) + lwz r17, 1*4(r3) + lwz r18, 2*4(r3) + lwz r19, 3*4(r3) + lwz r20, 4*4(r3) + lwz r21, 5*4(r3) + lwz r22, 6*4(r3) + lwz r23, 7*4(r3) + stw r16, 24*4(r1) + stw r17, 25*4(r1) + stw r18, 26*4(r1) + stw r19, 27*4(r1) + stw r20, 28*4(r1) + stw r21, 29*4(r1) + stw r22, 30*4(r1) + stw r23, 31*4(r1) + lwz r24, 8*4(r3) + lwz r25, 9*4(r3) + lwz r26, 10*4(r3) + lwz r27, 11*4(r3) + lwz r28, 12*4(r3) + lwz r29, 13*4(r3) + lwz r30, 14*4(r3) + lwz r31, 15*4(r3) + stw r24, 32*4(r1) + 
stw r25, 33*4(r1) + stw r26, 34*4(r1) + stw r27, 35*4(r1) + stw r28, 36*4(r1) + stw r29, 37*4(r1) + stw r30, 38*4(r1) + stw r31, 39*4(r1) + lwz r16, 16*4(r3) + lwz r17, 17*4(r3) + lwz r18, 18*4(r3) + lwz r19, 19*4(r3) + lwz r20, 20*4(r3) + lwz r21, 21*4(r3) + lwz r22, 22*4(r3) + lwz r23, 23*4(r3) + stw r16, 40*4(r1) + stw r17, 41*4(r1) + stw r18, 42*4(r1) + stw r19, 43*4(r1) + stw r20, 44*4(r1) + stw r21, 45*4(r1) + stw r22, 46*4(r1) + stw r23, 47*4(r1) + lwz r8, 24*4(r3) + lwz r9, 25*4(r3) + lwz r10, 26*4(r3) + lwz r11, 27*4(r3) + lwz r12, 28*4(r3) + lwz r13, 29*4(r3) + lwz r14, 30*4(r3) + lwz r15, 31*4(r3) + + mtctr r5 +scrypt_core_loop1: + lwz r16, 24*4(r1) + lwz r17, 25*4(r1) + lwz r18, 26*4(r1) + lwz r19, 27*4(r1) + lwz r20, 28*4(r1) + lwz r21, 29*4(r1) + lwz r22, 30*4(r1) + lwz r23, 31*4(r1) + lwz r24, 32*4(r1) + lwz r25, 33*4(r1) + lwz r26, 34*4(r1) + lwz r27, 35*4(r1) + lwz r28, 36*4(r1) + lwz r29, 37*4(r1) + lwz r30, 38*4(r1) + lwz r31, 39*4(r1) + + lwz r0, 40*4(r1) + lwz r5, 41*4(r1) + lwz r6, 42*4(r1) + lwz r7, 43*4(r1) + xor r16, r16, r0 + xor r17, r17, r5 + xor r18, r18, r6 + xor r19, r19, r7 + stw r16, 0*4(r4) + stw r17, 1*4(r4) + stw r18, 2*4(r4) + stw r19, 3*4(r4) + stw r0, 16*4(r4) + stw r5, 17*4(r4) + stw r6, 18*4(r4) + stw r7, 19*4(r4) + + lwz r0, 44*4(r1) + lwz r5, 45*4(r1) + lwz r6, 46*4(r1) + lwz r7, 47*4(r1) + xor r20, r20, r0 + xor r21, r21, r5 + xor r22, r22, r6 + xor r23, r23, r7 + stw r0, 20*4(r4) + stw r5, 21*4(r4) + stw r6, 22*4(r4) + stw r7, 23*4(r4) + stw r20, 4*4(r4) + stw r21, 5*4(r4) + stw r22, 6*4(r4) + stw r23, 7*4(r4) + + xor r24, r24, r8 + xor r25, r25, r9 + xor r26, r26, r10 + xor r27, r27, r11 + xor r28, r28, r12 + xor r29, r29, r13 + xor r30, r30, r14 + xor r31, r31, r15 + stw r24, 8*4(r4) + stw r25, 9*4(r4) + stw r26, 10*4(r4) + stw r27, 11*4(r4) + stw r28, 12*4(r4) + stw r29, 13*4(r4) + stw r30, 14*4(r4) + stw r31, 15*4(r4) + stw r8, 24*4(r4) + stw r9, 25*4(r4) + stw r10, 26*4(r4) + stw r11, 27*4(r4) + stw r12, 28*4(r4) + stw r13, 29*4(r4) + stw r14, 30*4(r4) + stw r15, 31*4(r4) + + salsa8_core + + lwz r0, 0*4(r4) + lwz r5, 1*4(r4) + lwz r6, 2*4(r4) + lwz r7, 3*4(r4) + add r16, r16, r0 + add r17, r17, r5 + add r18, r18, r6 + add r19, r19, r7 + lwz r0, 4*4(r4) + lwz r5, 5*4(r4) + lwz r6, 6*4(r4) + lwz r7, 7*4(r4) + add r20, r20, r0 + add r21, r21, r5 + add r22, r22, r6 + add r23, r23, r7 + lwz r0, 8*4(r4) + lwz r5, 9*4(r4) + lwz r6, 10*4(r4) + lwz r7, 11*4(r4) + add r24, r24, r0 + add r25, r25, r5 + add r26, r26, r6 + add r27, r27, r7 + lwz r0, 12*4(r4) + lwz r5, 13*4(r4) + lwz r6, 14*4(r4) + lwz r7, 15*4(r4) + add r28, r28, r0 + add r29, r29, r5 + add r30, r30, r6 + add r31, r31, r7 + + stw r16, 24*4(r1) + stw r17, 25*4(r1) + stw r18, 26*4(r1) + stw r19, 27*4(r1) + stw r20, 28*4(r1) + stw r21, 29*4(r1) + stw r22, 30*4(r1) + stw r23, 31*4(r1) + stw r24, 32*4(r1) + stw r25, 33*4(r1) + stw r26, 34*4(r1) + stw r27, 35*4(r1) + stw r28, 36*4(r1) + stw r29, 37*4(r1) + stw r30, 38*4(r1) + stw r31, 39*4(r1) + + lwz r0, 40*4(r1) + lwz r5, 41*4(r1) + lwz r6, 42*4(r1) + lwz r7, 43*4(r1) + xor r16, r16, r0 + xor r17, r17, r5 + xor r18, r18, r6 + xor r19, r19, r7 + lwz r0, 44*4(r1) + lwz r5, 45*4(r1) + lwz r6, 46*4(r1) + lwz r7, 47*4(r1) + xor r20, r20, r0 + xor r21, r21, r5 + xor r22, r22, r6 + xor r23, r23, r7 + xor r24, r24, r8 + xor r25, r25, r9 + xor r26, r26, r10 + xor r27, r27, r11 + xor r28, r28, r12 + xor r29, r29, r13 + xor r30, r30, r14 + xor r31, r31, r15 + stw r16, 40*4(r1) + stw r17, 41*4(r1) + stw r18, 42*4(r1) + stw r19, 43*4(r1) + mr r8, r24 + 
mr r9, r25 + mr r10, r26 + mr r11, r27 + stw r20, 44*4(r1) + stw r21, 45*4(r1) + stw r22, 46*4(r1) + stw r23, 47*4(r1) + mr r12, r28 + mr r13, r29 + mr r14, r30 + mr r15, r31 + + salsa8_core + + lwz r0, 40*4(r1) + lwz r5, 41*4(r1) + lwz r6, 42*4(r1) + lwz r7, 43*4(r1) + add r16, r16, r0 + add r17, r17, r5 + add r18, r18, r6 + add r19, r19, r7 + lwz r0, 44*4(r1) + lwz r5, 45*4(r1) + lwz r6, 46*4(r1) + lwz r7, 47*4(r1) + add r20, r20, r0 + add r21, r21, r5 + add r22, r22, r6 + add r23, r23, r7 + add r8, r8, r24 + add r9, r9, r25 + add r10, r10, r26 + add r11, r11, r27 + stw r16, 40*4(r1) + stw r17, 41*4(r1) + stw r18, 42*4(r1) + stw r19, 43*4(r1) + add r12, r12, r28 + add r13, r13, r29 + add r14, r14, r30 + add r15, r15, r31 + stw r20, 44*4(r1) + stw r21, 45*4(r1) + stw r22, 46*4(r1) + stw r23, 47*4(r1) + + addi r4, r4, 32*4 + bdnz scrypt_core_loop1 + + lwz r5, 2*4(r1) + slwi r3, r5, 7 + subf r4, r3, r4 + mtctr r5 + addi r5, r5, -1 + stw r5, 2*4(r1) +scrypt_core_loop2: + and r3, r16, r5 + slwi r3, r3, 7 + add r3, r3, r4 + mr r0, r16 + mr r5, r17 + mr r6, r18 + mr r7, r19 + lwz r16, 24*4(r1) + lwz r17, 25*4(r1) + lwz r18, 26*4(r1) + lwz r19, 27*4(r1) + lwz r20, 28*4(r1) + lwz r21, 29*4(r1) + lwz r22, 30*4(r1) + lwz r23, 31*4(r1) + lwz r24, 32*4(r1) + lwz r25, 33*4(r1) + lwz r26, 34*4(r1) + lwz r27, 35*4(r1) + lwz r28, 36*4(r1) + lwz r29, 37*4(r1) + lwz r30, 38*4(r1) + lwz r31, 39*4(r1) + xor r16, r16, r0 + xor r17, r17, r5 + xor r18, r18, r6 + xor r19, r19, r7 + lwz r0, 44*4(r1) + lwz r5, 45*4(r1) + lwz r6, 46*4(r1) + lwz r7, 47*4(r1) + xor r20, r20, r0 + xor r21, r21, r5 + xor r22, r22, r6 + xor r23, r23, r7 + xor r24, r24, r8 + xor r25, r25, r9 + xor r26, r26, r10 + xor r27, r27, r11 + xor r28, r28, r12 + xor r29, r29, r13 + xor r30, r30, r14 + xor r31, r31, r15 + + lwz r0, 0*4(r3) + lwz r5, 1*4(r3) + lwz r6, 2*4(r3) + lwz r7, 3*4(r3) + xor r16, r16, r0 + xor r17, r17, r5 + xor r18, r18, r6 + xor r19, r19, r7 + lwz r0, 4*4(r3) + lwz r5, 5*4(r3) + lwz r6, 6*4(r3) + lwz r7, 7*4(r3) + xor r20, r20, r0 + xor r21, r21, r5 + xor r22, r22, r6 + xor r23, r23, r7 + lwz r0, 8*4(r3) + lwz r5, 9*4(r3) + lwz r6, 10*4(r3) + lwz r7, 11*4(r3) + xor r24, r24, r0 + xor r25, r25, r5 + xor r26, r26, r6 + xor r27, r27, r7 + lwz r0, 12*4(r3) + lwz r5, 13*4(r3) + lwz r6, 14*4(r3) + lwz r7, 15*4(r3) + xor r28, r28, r0 + xor r29, r29, r5 + xor r30, r30, r6 + xor r31, r31, r7 + + stw r16, 24*4(r1) + stw r17, 25*4(r1) + stw r18, 26*4(r1) + stw r19, 27*4(r1) + stw r20, 28*4(r1) + stw r21, 29*4(r1) + stw r22, 30*4(r1) + stw r23, 31*4(r1) + stw r24, 32*4(r1) + stw r25, 33*4(r1) + stw r26, 34*4(r1) + stw r27, 35*4(r1) + stw r28, 36*4(r1) + stw r29, 37*4(r1) + stw r30, 38*4(r1) + stw r31, 39*4(r1) + + salsa8_core + + lwz r0, 24*4(r1) + lwz r5, 25*4(r1) + lwz r6, 26*4(r1) + lwz r7, 27*4(r1) + add r16, r16, r0 + add r17, r17, r5 + add r18, r18, r6 + add r19, r19, r7 + lwz r0, 28*4(r1) + lwz r5, 29*4(r1) + lwz r6, 30*4(r1) + lwz r7, 31*4(r1) + add r20, r20, r0 + add r21, r21, r5 + add r22, r22, r6 + add r23, r23, r7 + lwz r0, 32*4(r1) + lwz r5, 33*4(r1) + lwz r6, 34*4(r1) + lwz r7, 35*4(r1) + add r24, r24, r0 + add r25, r25, r5 + add r26, r26, r6 + add r27, r27, r7 + lwz r0, 36*4(r1) + lwz r5, 37*4(r1) + lwz r6, 38*4(r1) + lwz r7, 39*4(r1) + add r28, r28, r0 + add r29, r29, r5 + add r30, r30, r6 + add r31, r31, r7 + + stw r16, 24*4(r1) + stw r17, 25*4(r1) + stw r18, 26*4(r1) + stw r19, 27*4(r1) + stw r20, 28*4(r1) + stw r21, 29*4(r1) + stw r22, 30*4(r1) + stw r23, 31*4(r1) + stw r24, 32*4(r1) + stw r25, 33*4(r1) + stw r26, 
34*4(r1) + stw r27, 35*4(r1) + stw r28, 36*4(r1) + stw r29, 37*4(r1) + stw r30, 38*4(r1) + stw r31, 39*4(r1) + + lwz r0, 16*4(r3) + lwz r5, 17*4(r3) + lwz r6, 18*4(r3) + lwz r7, 19*4(r3) + xor r16, r16, r0 + xor r17, r17, r5 + xor r18, r18, r6 + xor r19, r19, r7 + lwz r0, 20*4(r3) + lwz r5, 21*4(r3) + lwz r6, 22*4(r3) + lwz r7, 23*4(r3) + xor r20, r20, r0 + xor r21, r21, r5 + xor r22, r22, r6 + xor r23, r23, r7 + lwz r0, 24*4(r3) + lwz r5, 25*4(r3) + lwz r6, 26*4(r3) + lwz r7, 27*4(r3) + xor r24, r24, r0 + xor r25, r25, r5 + xor r26, r26, r6 + xor r27, r27, r7 + lwz r0, 28*4(r3) + lwz r5, 29*4(r3) + lwz r6, 30*4(r3) + lwz r7, 31*4(r3) + xor r28, r28, r0 + xor r29, r29, r5 + xor r30, r30, r6 + xor r31, r31, r7 + + lwz r0, 40*4(r1) + lwz r5, 41*4(r1) + lwz r6, 42*4(r1) + lwz r7, 43*4(r1) + xor r16, r16, r0 + xor r17, r17, r5 + xor r18, r18, r6 + xor r19, r19, r7 + lwz r0, 44*4(r1) + lwz r5, 45*4(r1) + lwz r6, 46*4(r1) + lwz r7, 47*4(r1) + xor r20, r20, r0 + xor r21, r21, r5 + xor r22, r22, r6 + xor r23, r23, r7 + xor r24, r24, r8 + xor r25, r25, r9 + xor r26, r26, r10 + xor r27, r27, r11 + xor r28, r28, r12 + xor r29, r29, r13 + xor r30, r30, r14 + xor r31, r31, r15 + stw r16, 40*4(r1) + stw r17, 41*4(r1) + stw r18, 42*4(r1) + stw r19, 43*4(r1) + mr r8, r24 + mr r9, r25 + mr r10, r26 + mr r11, r27 + stw r20, 44*4(r1) + stw r21, 45*4(r1) + stw r22, 46*4(r1) + stw r23, 47*4(r1) + mr r12, r28 + mr r13, r29 + mr r14, r30 + mr r15, r31 + + salsa8_core + + lwz r0, 40*4(r1) + lwz r5, 41*4(r1) + lwz r6, 42*4(r1) + lwz r7, 43*4(r1) + add r16, r16, r0 + add r17, r17, r5 + add r18, r18, r6 + add r19, r19, r7 + lwz r0, 44*4(r1) + lwz r5, 45*4(r1) + lwz r6, 46*4(r1) + lwz r7, 47*4(r1) + add r20, r20, r0 + add r21, r21, r5 + add r22, r22, r6 + add r23, r23, r7 + lwz r5, 2*4(r1) + add r8, r8, r24 + add r9, r9, r25 + add r10, r10, r26 + add r11, r11, r27 + add r12, r12, r28 + add r13, r13, r29 + add r14, r14, r30 + add r15, r15, r31 + stw r16, 40*4(r1) + stw r17, 41*4(r1) + stw r18, 42*4(r1) + stw r19, 43*4(r1) + stw r20, 44*4(r1) + stw r21, 45*4(r1) + stw r22, 46*4(r1) + stw r23, 47*4(r1) + bdnz scrypt_core_loop2 + + ld r3, 22*4(r1) + + lwz r16, 24*4(r1) + lwz r17, 25*4(r1) + lwz r18, 26*4(r1) + lwz r19, 27*4(r1) + lwz r20, 28*4(r1) + lwz r21, 29*4(r1) + lwz r22, 30*4(r1) + lwz r23, 31*4(r1) + stw r16, 0*4(r3) + stw r17, 1*4(r3) + stw r18, 2*4(r3) + stw r19, 3*4(r3) + stw r20, 4*4(r3) + stw r21, 5*4(r3) + stw r22, 6*4(r3) + stw r23, 7*4(r3) + lwz r24, 32*4(r1) + lwz r25, 33*4(r1) + lwz r26, 34*4(r1) + lwz r27, 35*4(r1) + lwz r28, 36*4(r1) + lwz r29, 37*4(r1) + lwz r30, 38*4(r1) + lwz r31, 39*4(r1) + stw r24, 8*4(r3) + stw r25, 9*4(r3) + stw r26, 10*4(r3) + stw r27, 11*4(r3) + stw r28, 12*4(r3) + stw r29, 13*4(r3) + stw r30, 14*4(r3) + stw r31, 15*4(r3) + lwz r16, 40*4(r1) + lwz r17, 41*4(r1) + lwz r18, 42*4(r1) + lwz r19, 43*4(r1) + lwz r20, 44*4(r1) + lwz r21, 45*4(r1) + lwz r22, 46*4(r1) + lwz r23, 47*4(r1) + stw r16, 16*4(r3) + stw r17, 17*4(r3) + stw r18, 18*4(r3) + stw r19, 19*4(r3) + stw r20, 20*4(r3) + stw r21, 21*4(r3) + stw r22, 22*4(r3) + stw r23, 23*4(r3) + stw r8, 24*4(r3) + stw r9, 25*4(r3) + stw r10, 26*4(r3) + stw r11, 27*4(r3) + stw r12, 28*4(r3) + stw r13, 29*4(r3) + stw r14, 30*4(r3) + stw r15, 31*4(r3) + + ld r13, 4*4(r1) + ld r14, 6*4(r1) + ld r15, 8*4(r1) + ld r16, 10*4(r1) + ld r17, 12*4(r1) + ld r18, 14*4(r1) + ld r19, 16*4(r1) + ld r20, 18*4(r1) + ld r21, 20*4(r1) + ld r22, 48*4(r1) + ld r23, 50*4(r1) + ld r24, 52*4(r1) + ld r25, 54*4(r1) + ld r26, 56*4(r1) + ld r27, 58*4(r1) + ld 
r28, 60*4(r1) + ld r29, 62*4(r1) + ld r30, 64*4(r1) + ld r31, 66*4(r1) + addi r1, r1, 68*4 + blr + +#endif /* __ALTIVEC__ */ + +#endif diff --git a/scrypt-x64.S b/scrypt-x64.S index f9185d490..37ec5763d 100644 --- a/scrypt-x64.S +++ b/scrypt-x64.S @@ -112,43 +112,44 @@ scrypt_best_throughput_exit: ret -.macro scrypt_shuffle src, so, dest, do - movl \so+60(\src), %eax - movl \so+44(\src), %ebx - movl \so+28(\src), %ecx - movl \so+12(\src), %edx - movl %eax, \do+12(\dest) - movl %ebx, \do+28(\dest) - movl %ecx, \do+44(\dest) - movl %edx, \do+60(\dest) - movl \so+40(\src), %eax - movl \so+8(\src), %ebx - movl \so+48(\src), %ecx - movl \so+16(\src), %edx - movl %eax, \do+8(\dest) - movl %ebx, \do+40(\dest) - movl %ecx, \do+16(\dest) - movl %edx, \do+48(\dest) - movl \so+20(\src), %eax - movl \so+4(\src), %ebx - movl \so+52(\src), %ecx - movl \so+36(\src), %edx - movl %eax, \do+4(\dest) - movl %ebx, \do+20(\dest) - movl %ecx, \do+36(\dest) - movl %edx, \do+52(\dest) - movl \so+0(\src), %eax - movl \so+24(\src), %ebx - movl \so+32(\src), %ecx - movl \so+56(\src), %edx - movl %eax, \do+0(\dest) - movl %ebx, \do+24(\dest) - movl %ecx, \do+32(\dest) - movl %edx, \do+56(\dest) -.endm -.macro salsa8_core_gen_doubleround + + .text + .p2align 6 +salsa8_core_gen: + /* 0: %rdx, %rdi, %rcx, %rsi */ + movq 8(%rsp), %rdi + movq %rdi, %rdx + shrq $32, %rdi + movq 16(%rsp), %rsi + movq %rsi, %rcx + shrq $32, %rsi + /* 1: %r9, 72(%rsp), %rax, %r8 */ + movq 24(%rsp), %r8 + movq %r8, %r9 + shrq $32, %r8 + movq %r8, 72(%rsp) + movq 32(%rsp), %r8 + movq %r8, %rax + shrq $32, %r8 + /* 2: %r11, %r10, 48(%rsp), %r12 */ + movq 40(%rsp), %r10 + movq %r10, %r11 + shrq $32, %r10 + movq 48(%rsp), %r12 + /* movq %r12, %r13 */ + /* movq %r13, 48(%rsp) */ + shrq $32, %r12 + /* 3: %r14, %r13, %rbx, 88(%rsp) */ + movq 56(%rsp), %r13 + movq %r13, %r14 + shrq $32, %r13 + movq 64(%rsp), %r15 + movq %r15, %rbx + shrq $32, %r15 + movq %r15, 88(%rsp) + movq 72(%rsp), %r15 leaq (%r14, %rdx), %rbp @@ -271,773 +272,596 @@ scrypt_best_throughput_exit: xorl %ebp, %r15d movq %r15, 88(%rsp) -.endm - - .text - .p2align 6 -salsa8_core_gen: - /* 0: %rdx, %rdi, %rcx, %rsi */ - movq 8(%rsp), %rdi - movq %rdi, %rdx - shrq $32, %rdi - movq 16(%rsp), %rsi - movq %rsi, %rcx - shrq $32, %rsi - /* 1: %r9, 72(%rsp), %rax, %r8 */ - movq 24(%rsp), %r8 - movq %r8, %r9 - shrq $32, %r8 - movq %r8, 72(%rsp) - movq 32(%rsp), %r8 - movq %r8, %rax - shrq $32, %r8 - /* 2: %r11, %r10, 48(%rsp), %r12 */ - movq 40(%rsp), %r10 - movq %r10, %r11 - shrq $32, %r10 - movq 48(%rsp), %r12 - /* movq %r12, %r13 */ - /* movq %r13, 48(%rsp) */ - shrq $32, %r12 - /* 3: %r14, %r13, %rbx, 88(%rsp) */ - movq 56(%rsp), %r13 - movq %r13, %r14 - shrq $32, %r13 - movq 64(%rsp), %r15 - movq %r15, %rbx - shrq $32, %r15 - movq %r15, 88(%rsp) + movq 72(%rsp), %r15 - salsa8_core_gen_doubleround - salsa8_core_gen_doubleround - salsa8_core_gen_doubleround - salsa8_core_gen_doubleround + leaq (%r14, %rdx), %rbp + roll $7, %ebp + xorl %ebp, %r9d + leaq (%rdi, %r15), %rbp + roll $7, %ebp + xorl %ebp, %r10d + leaq (%rdx, %r9), %rbp + roll $9, %ebp + xorl %ebp, %r11d + leaq (%r15, %r10), %rbp + roll $9, %ebp + xorl %ebp, %r13d - shlq $32, %rdi - xorq %rdi, %rdx - movq %rdx, 24(%rsp) + leaq (%r9, %r11), %rbp + roll $13, %ebp + xorl %ebp, %r14d + leaq (%r10, %r13), %rbp + roll $13, %ebp + xorl %ebp, %edi + leaq (%r11, %r14), %rbp + roll $18, %ebp + xorl %ebp, %edx + leaq (%r13, %rdi), %rbp + roll $18, %ebp + xorl %ebp, %r15d - shlq $32, %rsi - xorq %rsi, %rcx - movq %rcx, 32(%rsp) + movq 
48(%rsp), %rbp + movq %r15, 72(%rsp) - movl 72(%rsp), %edi - shlq $32, %rdi - xorq %rdi, %r9 - movq %r9, 40(%rsp) + leaq (%rax, %rbp), %r15 + roll $7, %r15d + xorl %r15d, %ebx + leaq (%rbp, %rbx), %r15 + roll $9, %r15d + xorl %r15d, %ecx + leaq (%rbx, %rcx), %r15 + roll $13, %r15d + xorl %r15d, %eax + leaq (%rcx, %rax), %r15 + roll $18, %r15d + xorl %r15d, %ebp - movl 48(%rsp), %ebp - shlq $32, %r8 - xorq %r8, %rax - movq %rax, 48(%rsp) + movq 88(%rsp), %r15 + movq %rbp, 48(%rsp) - shlq $32, %r10 - xorq %r10, %r11 - movq %r11, 56(%rsp) + leaq (%r12, %r15), %rbp + roll $7, %ebp + xorl %ebp, %esi + leaq (%r15, %rsi), %rbp + roll $9, %ebp + xorl %ebp, %r8d + leaq (%rsi, %r8), %rbp + roll $13, %ebp + xorl %ebp, %r12d + leaq (%r8, %r12), %rbp + roll $18, %ebp + xorl %ebp, %r15d - shlq $32, %r12 - xorq %r12, %rbp - movq %rbp, 64(%rsp) + movq %r15, 88(%rsp) + movq 72(%rsp), %r15 - shlq $32, %r13 - xorq %r13, %r14 - movq %r14, 72(%rsp) + leaq (%rsi, %rdx), %rbp + roll $7, %ebp + xorl %ebp, %edi + leaq (%r9, %r15), %rbp + roll $7, %ebp + xorl %ebp, %eax + leaq (%rdx, %rdi), %rbp + roll $9, %ebp + xorl %ebp, %ecx + leaq (%r15, %rax), %rbp + roll $9, %ebp + xorl %ebp, %r8d - movdqa 24(%rsp), %xmm0 + leaq (%rdi, %rcx), %rbp + roll $13, %ebp + xorl %ebp, %esi + leaq (%rax, %r8), %rbp + roll $13, %ebp + xorl %ebp, %r9d + leaq (%rcx, %rsi), %rbp + roll $18, %ebp + xorl %ebp, %edx + leaq (%r8, %r9), %rbp + roll $18, %ebp + xorl %ebp, %r15d - shlq $32, %r15 - xorq %r15, %rbx - movq %rbx, 80(%rsp) + movq 48(%rsp), %rbp + movq %r15, 72(%rsp) - movdqa 40(%rsp), %xmm1 - movdqa 56(%rsp), %xmm2 - movdqa 72(%rsp), %xmm3 + leaq (%r10, %rbp), %r15 + roll $7, %r15d + xorl %r15d, %r12d + leaq (%rbp, %r12), %r15 + roll $9, %r15d + xorl %r15d, %r11d + leaq (%r12, %r11), %r15 + roll $13, %r15d + xorl %r15d, %r10d + leaq (%r11, %r10), %r15 + roll $18, %r15d + xorl %r15d, %ebp - ret + movq 88(%rsp), %r15 + movq %rbp, 48(%rsp) + leaq (%rbx, %r15), %rbp + roll $7, %ebp + xorl %ebp, %r14d + leaq (%r15, %r14), %rbp + roll $9, %ebp + xorl %ebp, %r13d + leaq (%r14, %r13), %rbp + roll $13, %ebp + xorl %ebp, %ebx + leaq (%r13, %rbx), %rbp + roll $18, %ebp + xorl %ebp, %r15d - .text - .p2align 6 - .globl scrypt_core - .globl _scrypt_core -scrypt_core: -_scrypt_core: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 -#if defined(_WIN64) || defined(__CYGWIN__) - subq $176, %rsp - movdqa %xmm6, 8(%rsp) - movdqa %xmm7, 24(%rsp) - movdqa %xmm8, 40(%rsp) - movdqa %xmm9, 56(%rsp) - movdqa %xmm10, 72(%rsp) - movdqa %xmm11, 88(%rsp) - movdqa %xmm12, 104(%rsp) - movdqa %xmm13, 120(%rsp) - movdqa %xmm14, 136(%rsp) - movdqa %xmm15, 152(%rsp) - pushq %rdi - pushq %rsi - movq %rcx, %rdi - movq %rdx, %rsi -#else - movq %rdx, %r8 -#endif - -.macro scrypt_core_cleanup -#if defined(_WIN64) || defined(__CYGWIN__) - popq %rsi - popq %rdi - movdqa 8(%rsp), %xmm6 - movdqa 24(%rsp), %xmm7 - movdqa 40(%rsp), %xmm8 - movdqa 56(%rsp), %xmm9 - movdqa 72(%rsp), %xmm10 - movdqa 88(%rsp), %xmm11 - movdqa 104(%rsp), %xmm12 - movdqa 120(%rsp), %xmm13 - movdqa 136(%rsp), %xmm14 - movdqa 152(%rsp), %xmm15 - addq $176, %rsp -#endif - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx -.endm + movq %r15, 88(%rsp) + movq 72(%rsp), %r15 - /* GenuineIntel processors have fast SIMD */ - xorl %eax, %eax - cpuid - cmpl $0x6c65746e, %ecx - jne scrypt_core_gen - cmpl $0x49656e69, %edx - jne scrypt_core_gen - cmpl $0x756e6547, %ebx - je scrypt_core_xmm + leaq (%r14, %rdx), %rbp + roll $7, %ebp + xorl %ebp, %r9d + leaq (%rdi, 
%r15), %rbp + roll $7, %ebp + xorl %ebp, %r10d + leaq (%rdx, %r9), %rbp + roll $9, %ebp + xorl %ebp, %r11d + leaq (%r15, %r10), %rbp + roll $9, %ebp + xorl %ebp, %r13d - .p2align 6 -scrypt_core_gen: - subq $136, %rsp - movdqa 0(%rdi), %xmm8 - movdqa 16(%rdi), %xmm9 - movdqa 32(%rdi), %xmm10 - movdqa 48(%rdi), %xmm11 - movdqa 64(%rdi), %xmm12 - movdqa 80(%rdi), %xmm13 - movdqa 96(%rdi), %xmm14 - movdqa 112(%rdi), %xmm15 + leaq (%r9, %r11), %rbp + roll $13, %ebp + xorl %ebp, %r14d + leaq (%r10, %r13), %rbp + roll $13, %ebp + xorl %ebp, %edi + leaq (%r11, %r14), %rbp + roll $18, %ebp + xorl %ebp, %edx + leaq (%r13, %rdi), %rbp + roll $18, %ebp + xorl %ebp, %r15d - movq %r8, %rcx - shlq $7, %rcx - addq %rsi, %rcx - movq %r8, 96(%rsp) - movq %rdi, 104(%rsp) - movq %rsi, 112(%rsp) - movq %rcx, 120(%rsp) -scrypt_core_gen_loop1: - movdqa %xmm8, 0(%rsi) - movdqa %xmm9, 16(%rsi) - movdqa %xmm10, 32(%rsi) - movdqa %xmm11, 48(%rsi) - movdqa %xmm12, 64(%rsi) - movdqa %xmm13, 80(%rsi) - movdqa %xmm14, 96(%rsi) - movdqa %xmm15, 112(%rsi) + movq 48(%rsp), %rbp + movq %r15, 72(%rsp) - pxor %xmm12, %xmm8 - pxor %xmm13, %xmm9 - pxor %xmm14, %xmm10 - pxor %xmm15, %xmm11 - movdqa %xmm8, 0(%rsp) - movdqa %xmm9, 16(%rsp) - movdqa %xmm10, 32(%rsp) - movdqa %xmm11, 48(%rsp) - movq %rsi, 128(%rsp) - call salsa8_core_gen - paddd %xmm0, %xmm8 - paddd %xmm1, %xmm9 - paddd %xmm2, %xmm10 - paddd %xmm3, %xmm11 + leaq (%rax, %rbp), %r15 + roll $7, %r15d + xorl %r15d, %ebx + leaq (%rbp, %rbx), %r15 + roll $9, %r15d + xorl %r15d, %ecx + leaq (%rbx, %rcx), %r15 + roll $13, %r15d + xorl %r15d, %eax + leaq (%rcx, %rax), %r15 + roll $18, %r15d + xorl %r15d, %ebp - pxor %xmm8, %xmm12 - pxor %xmm9, %xmm13 - pxor %xmm10, %xmm14 - pxor %xmm11, %xmm15 - movdqa %xmm12, 0(%rsp) - movdqa %xmm13, 16(%rsp) - movdqa %xmm14, 32(%rsp) - movdqa %xmm15, 48(%rsp) - call salsa8_core_gen - movq 128(%rsp), %rsi - paddd %xmm0, %xmm12 - paddd %xmm1, %xmm13 - paddd %xmm2, %xmm14 - paddd %xmm3, %xmm15 + movq 88(%rsp), %r15 + movq %rbp, 48(%rsp) - addq $128, %rsi - movq 120(%rsp), %rcx - cmpq %rcx, %rsi - jne scrypt_core_gen_loop1 + leaq (%r12, %r15), %rbp + roll $7, %ebp + xorl %ebp, %esi + leaq (%r15, %rsi), %rbp + roll $9, %ebp + xorl %ebp, %r8d + leaq (%rsi, %r8), %rbp + roll $13, %ebp + xorl %ebp, %r12d + leaq (%r8, %r12), %rbp + roll $18, %ebp + xorl %ebp, %r15d - movq 96(%rsp), %r8 - movq %r8, %rcx - subl $1, %r8d - movq %r8, 96(%rsp) - movd %xmm12, %edx -scrypt_core_gen_loop2: - movq 112(%rsp), %rsi - andl %r8d, %edx - shll $7, %edx - addq %rsi, %rdx - movdqa 0(%rdx), %xmm0 - movdqa 16(%rdx), %xmm1 - movdqa 32(%rdx), %xmm2 - movdqa 48(%rdx), %xmm3 - movdqa 64(%rdx), %xmm4 - movdqa 80(%rdx), %xmm5 - movdqa 96(%rdx), %xmm6 - movdqa 112(%rdx), %xmm7 - pxor %xmm0, %xmm8 - pxor %xmm1, %xmm9 - pxor %xmm2, %xmm10 - pxor %xmm3, %xmm11 - pxor %xmm4, %xmm12 - pxor %xmm5, %xmm13 - pxor %xmm6, %xmm14 - pxor %xmm7, %xmm15 + movq %r15, 88(%rsp) + movq 72(%rsp), %r15 - pxor %xmm12, %xmm8 - pxor %xmm13, %xmm9 - pxor %xmm14, %xmm10 - pxor %xmm15, %xmm11 - movdqa %xmm8, 0(%rsp) - movdqa %xmm9, 16(%rsp) - movdqa %xmm10, 32(%rsp) - movdqa %xmm11, 48(%rsp) - movq %rcx, 128(%rsp) - call salsa8_core_gen - paddd %xmm0, %xmm8 - paddd %xmm1, %xmm9 - paddd %xmm2, %xmm10 - paddd %xmm3, %xmm11 + leaq (%rsi, %rdx), %rbp + roll $7, %ebp + xorl %ebp, %edi + leaq (%r9, %r15), %rbp + roll $7, %ebp + xorl %ebp, %eax + leaq (%rdx, %rdi), %rbp + roll $9, %ebp + xorl %ebp, %ecx + leaq (%r15, %rax), %rbp + roll $9, %ebp + xorl %ebp, %r8d - pxor %xmm8, %xmm12 - pxor %xmm9, %xmm13 - 
pxor %xmm10, %xmm14 - pxor %xmm11, %xmm15 - movdqa %xmm12, 0(%rsp) - movdqa %xmm13, 16(%rsp) - movdqa %xmm14, 32(%rsp) - movdqa %xmm15, 48(%rsp) - call salsa8_core_gen - movq 96(%rsp), %r8 - movq 128(%rsp), %rcx - addl 0(%rsp), %edx - paddd %xmm0, %xmm12 - paddd %xmm1, %xmm13 - paddd %xmm2, %xmm14 - paddd %xmm3, %xmm15 + leaq (%rdi, %rcx), %rbp + roll $13, %ebp + xorl %ebp, %esi + leaq (%rax, %r8), %rbp + roll $13, %ebp + xorl %ebp, %r9d + leaq (%rcx, %rsi), %rbp + roll $18, %ebp + xorl %ebp, %edx + leaq (%r8, %r9), %rbp + roll $18, %ebp + xorl %ebp, %r15d - subq $1, %rcx - ja scrypt_core_gen_loop2 + movq 48(%rsp), %rbp + movq %r15, 72(%rsp) - movq 104(%rsp), %rdi - movdqa %xmm8, 0(%rdi) - movdqa %xmm9, 16(%rdi) - movdqa %xmm10, 32(%rdi) - movdqa %xmm11, 48(%rdi) - movdqa %xmm12, 64(%rdi) - movdqa %xmm13, 80(%rdi) - movdqa %xmm14, 96(%rdi) - movdqa %xmm15, 112(%rdi) + leaq (%r10, %rbp), %r15 + roll $7, %r15d + xorl %r15d, %r12d + leaq (%rbp, %r12), %r15 + roll $9, %r15d + xorl %r15d, %r11d + leaq (%r12, %r11), %r15 + roll $13, %r15d + xorl %r15d, %r10d + leaq (%r11, %r10), %r15 + roll $18, %r15d + xorl %r15d, %ebp - addq $136, %rsp - scrypt_core_cleanup - ret - - -.macro salsa8_core_xmm_doubleround - movdqa %xmm1, %xmm4 - paddd %xmm0, %xmm4 - movdqa %xmm4, %xmm5 - pslld $7, %xmm4 - psrld $25, %xmm5 - pxor %xmm4, %xmm3 - movdqa %xmm0, %xmm4 - pxor %xmm5, %xmm3 + movq 88(%rsp), %r15 + movq %rbp, 48(%rsp) - paddd %xmm3, %xmm4 - movdqa %xmm4, %xmm5 - pslld $9, %xmm4 - psrld $23, %xmm5 - pxor %xmm4, %xmm2 - movdqa %xmm3, %xmm4 - pxor %xmm5, %xmm2 - pshufd $0x93, %xmm3, %xmm3 + leaq (%rbx, %r15), %rbp + roll $7, %ebp + xorl %ebp, %r14d + leaq (%r15, %r14), %rbp + roll $9, %ebp + xorl %ebp, %r13d + leaq (%r14, %r13), %rbp + roll $13, %ebp + xorl %ebp, %ebx + leaq (%r13, %rbx), %rbp + roll $18, %ebp + xorl %ebp, %r15d - paddd %xmm2, %xmm4 - movdqa %xmm4, %xmm5 - pslld $13, %xmm4 - psrld $19, %xmm5 - pxor %xmm4, %xmm1 - movdqa %xmm2, %xmm4 - pxor %xmm5, %xmm1 - pshufd $0x4e, %xmm2, %xmm2 + movq %r15, 88(%rsp) + movq 72(%rsp), %r15 - paddd %xmm1, %xmm4 - movdqa %xmm4, %xmm5 - pslld $18, %xmm4 - psrld $14, %xmm5 - pxor %xmm4, %xmm0 - movdqa %xmm3, %xmm4 - pxor %xmm5, %xmm0 - pshufd $0x39, %xmm1, %xmm1 + leaq (%r14, %rdx), %rbp + roll $7, %ebp + xorl %ebp, %r9d + leaq (%rdi, %r15), %rbp + roll $7, %ebp + xorl %ebp, %r10d + leaq (%rdx, %r9), %rbp + roll $9, %ebp + xorl %ebp, %r11d + leaq (%r15, %r10), %rbp + roll $9, %ebp + xorl %ebp, %r13d - paddd %xmm0, %xmm4 - movdqa %xmm4, %xmm5 - pslld $7, %xmm4 - psrld $25, %xmm5 - pxor %xmm4, %xmm1 - movdqa %xmm0, %xmm4 - pxor %xmm5, %xmm1 + leaq (%r9, %r11), %rbp + roll $13, %ebp + xorl %ebp, %r14d + leaq (%r10, %r13), %rbp + roll $13, %ebp + xorl %ebp, %edi + leaq (%r11, %r14), %rbp + roll $18, %ebp + xorl %ebp, %edx + leaq (%r13, %rdi), %rbp + roll $18, %ebp + xorl %ebp, %r15d - paddd %xmm1, %xmm4 - movdqa %xmm4, %xmm5 - pslld $9, %xmm4 - psrld $23, %xmm5 - pxor %xmm4, %xmm2 - movdqa %xmm1, %xmm4 - pxor %xmm5, %xmm2 - pshufd $0x93, %xmm1, %xmm1 + movq 48(%rsp), %rbp + movq %r15, 72(%rsp) - paddd %xmm2, %xmm4 - movdqa %xmm4, %xmm5 - pslld $13, %xmm4 - psrld $19, %xmm5 - pxor %xmm4, %xmm3 - movdqa %xmm2, %xmm4 - pxor %xmm5, %xmm3 - pshufd $0x4e, %xmm2, %xmm2 + leaq (%rax, %rbp), %r15 + roll $7, %r15d + xorl %r15d, %ebx + leaq (%rbp, %rbx), %r15 + roll $9, %r15d + xorl %r15d, %ecx + leaq (%rbx, %rcx), %r15 + roll $13, %r15d + xorl %r15d, %eax + leaq (%rcx, %rax), %r15 + roll $18, %r15d + xorl %r15d, %ebp - paddd %xmm3, %xmm4 - movdqa %xmm4, %xmm5 - pslld $18, 
%xmm4 - psrld $14, %xmm5 - pxor %xmm4, %xmm0 - pshufd $0x39, %xmm3, %xmm3 - pxor %xmm5, %xmm0 -.endm - -.macro salsa8_core_xmm - salsa8_core_xmm_doubleround - salsa8_core_xmm_doubleround - salsa8_core_xmm_doubleround - salsa8_core_xmm_doubleround -.endm + movq 88(%rsp), %r15 + movq %rbp, 48(%rsp) - .p2align 6 -scrypt_core_xmm: - pcmpeqw %xmm1, %xmm1 - psrlq $32, %xmm1 + leaq (%r12, %r15), %rbp + roll $7, %ebp + xorl %ebp, %esi + leaq (%r15, %rsi), %rbp + roll $9, %ebp + xorl %ebp, %r8d + leaq (%rsi, %r8), %rbp + roll $13, %ebp + xorl %ebp, %r12d + leaq (%r8, %r12), %rbp + roll $18, %ebp + xorl %ebp, %r15d - movdqa 0(%rdi), %xmm8 - movdqa 16(%rdi), %xmm11 - movdqa 32(%rdi), %xmm10 - movdqa 48(%rdi), %xmm9 - movdqa %xmm8, %xmm0 - pxor %xmm11, %xmm8 - pand %xmm1, %xmm8 - pxor %xmm11, %xmm8 - pxor %xmm10, %xmm11 - pand %xmm1, %xmm11 - pxor %xmm10, %xmm11 - pxor %xmm9, %xmm10 - pand %xmm1, %xmm10 - pxor %xmm9, %xmm10 - pxor %xmm0, %xmm9 - pand %xmm1, %xmm9 - pxor %xmm0, %xmm9 - movdqa %xmm8, %xmm0 - pshufd $0x4e, %xmm10, %xmm10 - punpcklqdq %xmm10, %xmm8 - punpckhqdq %xmm0, %xmm10 - movdqa %xmm11, %xmm0 - pshufd $0x4e, %xmm9, %xmm9 - punpcklqdq %xmm9, %xmm11 - punpckhqdq %xmm0, %xmm9 + movq %r15, 88(%rsp) + movq 72(%rsp), %r15 - movdqa 64(%rdi), %xmm12 - movdqa 80(%rdi), %xmm15 - movdqa 96(%rdi), %xmm14 - movdqa 112(%rdi), %xmm13 - movdqa %xmm12, %xmm0 - pxor %xmm15, %xmm12 - pand %xmm1, %xmm12 - pxor %xmm15, %xmm12 - pxor %xmm14, %xmm15 - pand %xmm1, %xmm15 - pxor %xmm14, %xmm15 - pxor %xmm13, %xmm14 - pand %xmm1, %xmm14 - pxor %xmm13, %xmm14 - pxor %xmm0, %xmm13 - pand %xmm1, %xmm13 - pxor %xmm0, %xmm13 - movdqa %xmm12, %xmm0 - pshufd $0x4e, %xmm14, %xmm14 - punpcklqdq %xmm14, %xmm12 - punpckhqdq %xmm0, %xmm14 - movdqa %xmm15, %xmm0 - pshufd $0x4e, %xmm13, %xmm13 - punpcklqdq %xmm13, %xmm15 - punpckhqdq %xmm0, %xmm13 + leaq (%rsi, %rdx), %rbp + roll $7, %ebp + xorl %ebp, %edi + leaq (%r9, %r15), %rbp + roll $7, %ebp + xorl %ebp, %eax + leaq (%rdx, %rdi), %rbp + roll $9, %ebp + xorl %ebp, %ecx + leaq (%r15, %rax), %rbp + roll $9, %ebp + xorl %ebp, %r8d - movq %rsi, %rdx - movq %r8, %rcx - shlq $7, %rcx - addq %rsi, %rcx -scrypt_core_xmm_loop1: - pxor %xmm12, %xmm8 - pxor %xmm13, %xmm9 - pxor %xmm14, %xmm10 - pxor %xmm15, %xmm11 - movdqa %xmm8, 0(%rdx) - movdqa %xmm9, 16(%rdx) - movdqa %xmm10, 32(%rdx) - movdqa %xmm11, 48(%rdx) - movdqa %xmm12, 64(%rdx) - movdqa %xmm13, 80(%rdx) - movdqa %xmm14, 96(%rdx) - movdqa %xmm15, 112(%rdx) + leaq (%rdi, %rcx), %rbp + roll $13, %ebp + xorl %ebp, %esi + leaq (%rax, %r8), %rbp + roll $13, %ebp + xorl %ebp, %r9d + leaq (%rcx, %rsi), %rbp + roll $18, %ebp + xorl %ebp, %edx + leaq (%r8, %r9), %rbp + roll $18, %ebp + xorl %ebp, %r15d - movdqa %xmm8, %xmm0 - movdqa %xmm9, %xmm1 - movdqa %xmm10, %xmm2 - movdqa %xmm11, %xmm3 - salsa8_core_xmm - paddd %xmm0, %xmm8 - paddd %xmm1, %xmm9 - paddd %xmm2, %xmm10 - paddd %xmm3, %xmm11 + movq 48(%rsp), %rbp + movq %r15, 72(%rsp) - pxor %xmm8, %xmm12 - pxor %xmm9, %xmm13 - pxor %xmm10, %xmm14 - pxor %xmm11, %xmm15 - movdqa %xmm12, %xmm0 - movdqa %xmm13, %xmm1 - movdqa %xmm14, %xmm2 - movdqa %xmm15, %xmm3 - salsa8_core_xmm - paddd %xmm0, %xmm12 - paddd %xmm1, %xmm13 - paddd %xmm2, %xmm14 - paddd %xmm3, %xmm15 + leaq (%r10, %rbp), %r15 + roll $7, %r15d + xorl %r15d, %r12d + leaq (%rbp, %r12), %r15 + roll $9, %r15d + xorl %r15d, %r11d + leaq (%r12, %r11), %r15 + roll $13, %r15d + xorl %r15d, %r10d + leaq (%r11, %r10), %r15 + roll $18, %r15d + xorl %r15d, %ebp - addq $128, %rdx - cmpq %rcx, %rdx - jne scrypt_core_xmm_loop1 + 
movq 88(%rsp), %r15 + movq %rbp, 48(%rsp) - movq %r8, %rcx - subl $1, %r8d -scrypt_core_xmm_loop2: - movd %xmm12, %edx - andl %r8d, %edx - shll $7, %edx - pxor 0(%rsi, %rdx), %xmm8 - pxor 16(%rsi, %rdx), %xmm9 - pxor 32(%rsi, %rdx), %xmm10 - pxor 48(%rsi, %rdx), %xmm11 + leaq (%rbx, %r15), %rbp + roll $7, %ebp + xorl %ebp, %r14d + leaq (%r15, %r14), %rbp + roll $9, %ebp + xorl %ebp, %r13d + leaq (%r14, %r13), %rbp + roll $13, %ebp + xorl %ebp, %ebx + leaq (%r13, %rbx), %rbp + roll $18, %ebp + xorl %ebp, %r15d + + movq %r15, 88(%rsp) + + shlq $32, %rdi + xorq %rdi, %rdx + movq %rdx, 24(%rsp) + + shlq $32, %rsi + xorq %rsi, %rcx + movq %rcx, 32(%rsp) + + movl 72(%rsp), %edi + shlq $32, %rdi + xorq %rdi, %r9 + movq %r9, 40(%rsp) + + movl 48(%rsp), %ebp + shlq $32, %r8 + xorq %r8, %rax + movq %rax, 48(%rsp) + + shlq $32, %r10 + xorq %r10, %r11 + movq %r11, 56(%rsp) + + shlq $32, %r12 + xorq %r12, %rbp + movq %rbp, 64(%rsp) + + shlq $32, %r13 + xorq %r13, %r14 + movq %r14, 72(%rsp) + + movdqa 24(%rsp), %xmm0 + + shlq $32, %r15 + xorq %r15, %rbx + movq %rbx, 80(%rsp) + + movdqa 40(%rsp), %xmm1 + movdqa 56(%rsp), %xmm2 + movdqa 72(%rsp), %xmm3 + + ret + + + .text + .p2align 6 + .globl scrypt_core + .globl _scrypt_core +scrypt_core: +_scrypt_core: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 +#if defined(_WIN64) || defined(__CYGWIN__) + subq $176, %rsp + movdqa %xmm6, 8(%rsp) + movdqa %xmm7, 24(%rsp) + movdqa %xmm8, 40(%rsp) + movdqa %xmm9, 56(%rsp) + movdqa %xmm10, 72(%rsp) + movdqa %xmm11, 88(%rsp) + movdqa %xmm12, 104(%rsp) + movdqa %xmm13, 120(%rsp) + movdqa %xmm14, 136(%rsp) + movdqa %xmm15, 152(%rsp) + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#else + movq %rdx, %r8 +#endif + + + /* GenuineIntel processors have fast SIMD */ + xorl %eax, %eax + cpuid + cmpl $0x6c65746e, %ecx + jne scrypt_core_gen + cmpl $0x49656e69, %edx + jne scrypt_core_gen + cmpl $0x756e6547, %ebx + je scrypt_core_xmm + + .p2align 6 +scrypt_core_gen: + subq $136, %rsp + movdqa 0(%rdi), %xmm8 + movdqa 16(%rdi), %xmm9 + movdqa 32(%rdi), %xmm10 + movdqa 48(%rdi), %xmm11 + movdqa 64(%rdi), %xmm12 + movdqa 80(%rdi), %xmm13 + movdqa 96(%rdi), %xmm14 + movdqa 112(%rdi), %xmm15 + + movq %r8, %rcx + shlq $7, %rcx + addq %rsi, %rcx + movq %r8, 96(%rsp) + movq %rdi, 104(%rsp) + movq %rsi, 112(%rsp) + movq %rcx, 120(%rsp) +scrypt_core_gen_loop1: + movdqa %xmm8, 0(%rsi) + movdqa %xmm9, 16(%rsi) + movdqa %xmm10, 32(%rsi) + movdqa %xmm11, 48(%rsi) + movdqa %xmm12, 64(%rsi) + movdqa %xmm13, 80(%rsi) + movdqa %xmm14, 96(%rsi) + movdqa %xmm15, 112(%rsi) pxor %xmm12, %xmm8 pxor %xmm13, %xmm9 pxor %xmm14, %xmm10 pxor %xmm15, %xmm11 - movdqa %xmm8, %xmm0 - movdqa %xmm9, %xmm1 - movdqa %xmm10, %xmm2 - movdqa %xmm11, %xmm3 - salsa8_core_xmm + movdqa %xmm8, 0(%rsp) + movdqa %xmm9, 16(%rsp) + movdqa %xmm10, 32(%rsp) + movdqa %xmm11, 48(%rsp) + movq %rsi, 128(%rsp) + call salsa8_core_gen paddd %xmm0, %xmm8 paddd %xmm1, %xmm9 paddd %xmm2, %xmm10 paddd %xmm3, %xmm11 - pxor 64(%rsi, %rdx), %xmm12 - pxor 80(%rsi, %rdx), %xmm13 - pxor 96(%rsi, %rdx), %xmm14 - pxor 112(%rsi, %rdx), %xmm15 pxor %xmm8, %xmm12 pxor %xmm9, %xmm13 pxor %xmm10, %xmm14 pxor %xmm11, %xmm15 - movdqa %xmm12, %xmm0 - movdqa %xmm13, %xmm1 - movdqa %xmm14, %xmm2 - movdqa %xmm15, %xmm3 - salsa8_core_xmm + movdqa %xmm12, 0(%rsp) + movdqa %xmm13, 16(%rsp) + movdqa %xmm14, 32(%rsp) + movdqa %xmm15, 48(%rsp) + call salsa8_core_gen + movq 128(%rsp), %rsi paddd %xmm0, %xmm12 paddd %xmm1, %xmm13 paddd %xmm2, %xmm14 paddd %xmm3, %xmm15 - 
subq $1, %rcx - ja scrypt_core_xmm_loop2 + addq $128, %rsi + movq 120(%rsp), %rcx + cmpq %rcx, %rsi + jne scrypt_core_gen_loop1 - pcmpeqw %xmm1, %xmm1 - psrlq $32, %xmm1 + movq 96(%rsp), %r8 + movq %r8, %rcx + subl $1, %r8d + movq %r8, 96(%rsp) + movd %xmm12, %edx +scrypt_core_gen_loop2: + movq 112(%rsp), %rsi + andl %r8d, %edx + shll $7, %edx + addq %rsi, %rdx + movdqa 0(%rdx), %xmm0 + movdqa 16(%rdx), %xmm1 + movdqa 32(%rdx), %xmm2 + movdqa 48(%rdx), %xmm3 + movdqa 64(%rdx), %xmm4 + movdqa 80(%rdx), %xmm5 + movdqa 96(%rdx), %xmm6 + movdqa 112(%rdx), %xmm7 + pxor %xmm0, %xmm8 + pxor %xmm1, %xmm9 + pxor %xmm2, %xmm10 + pxor %xmm3, %xmm11 + pxor %xmm4, %xmm12 + pxor %xmm5, %xmm13 + pxor %xmm6, %xmm14 + pxor %xmm7, %xmm15 - movdqa %xmm8, %xmm0 - pxor %xmm9, %xmm8 - pand %xmm1, %xmm8 - pxor %xmm9, %xmm8 - pxor %xmm10, %xmm9 - pand %xmm1, %xmm9 - pxor %xmm10, %xmm9 - pxor %xmm11, %xmm10 - pand %xmm1, %xmm10 - pxor %xmm11, %xmm10 - pxor %xmm0, %xmm11 - pand %xmm1, %xmm11 - pxor %xmm0, %xmm11 - movdqa %xmm8, %xmm0 - pshufd $0x4e, %xmm10, %xmm10 - punpcklqdq %xmm10, %xmm8 - punpckhqdq %xmm0, %xmm10 - movdqa %xmm9, %xmm0 - pshufd $0x4e, %xmm11, %xmm11 - punpcklqdq %xmm11, %xmm9 - punpckhqdq %xmm0, %xmm11 + pxor %xmm12, %xmm8 + pxor %xmm13, %xmm9 + pxor %xmm14, %xmm10 + pxor %xmm15, %xmm11 + movdqa %xmm8, 0(%rsp) + movdqa %xmm9, 16(%rsp) + movdqa %xmm10, 32(%rsp) + movdqa %xmm11, 48(%rsp) + movq %rcx, 128(%rsp) + call salsa8_core_gen + paddd %xmm0, %xmm8 + paddd %xmm1, %xmm9 + paddd %xmm2, %xmm10 + paddd %xmm3, %xmm11 + + pxor %xmm8, %xmm12 + pxor %xmm9, %xmm13 + pxor %xmm10, %xmm14 + pxor %xmm11, %xmm15 + movdqa %xmm12, 0(%rsp) + movdqa %xmm13, 16(%rsp) + movdqa %xmm14, 32(%rsp) + movdqa %xmm15, 48(%rsp) + call salsa8_core_gen + movq 96(%rsp), %r8 + movq 128(%rsp), %rcx + addl 0(%rsp), %edx + paddd %xmm0, %xmm12 + paddd %xmm1, %xmm13 + paddd %xmm2, %xmm14 + paddd %xmm3, %xmm15 + + subq $1, %rcx + ja scrypt_core_gen_loop2 + + movq 104(%rsp), %rdi movdqa %xmm8, 0(%rdi) - movdqa %xmm11, 16(%rdi) + movdqa %xmm9, 16(%rdi) movdqa %xmm10, 32(%rdi) - movdqa %xmm9, 48(%rdi) + movdqa %xmm11, 48(%rdi) + movdqa %xmm12, 64(%rdi) + movdqa %xmm13, 80(%rdi) + movdqa %xmm14, 96(%rdi) + movdqa %xmm15, 112(%rdi) - movdqa %xmm12, %xmm0 - pxor %xmm13, %xmm12 - pand %xmm1, %xmm12 - pxor %xmm13, %xmm12 - pxor %xmm14, %xmm13 - pand %xmm1, %xmm13 - pxor %xmm14, %xmm13 - pxor %xmm15, %xmm14 - pand %xmm1, %xmm14 - pxor %xmm15, %xmm14 - pxor %xmm0, %xmm15 - pand %xmm1, %xmm15 - pxor %xmm0, %xmm15 - movdqa %xmm12, %xmm0 - pshufd $0x4e, %xmm14, %xmm14 - punpcklqdq %xmm14, %xmm12 - punpckhqdq %xmm0, %xmm14 - movdqa %xmm13, %xmm0 - pshufd $0x4e, %xmm15, %xmm15 - punpcklqdq %xmm15, %xmm13 - punpckhqdq %xmm0, %xmm15 - movdqa %xmm12, 64(%rdi) - movdqa %xmm15, 80(%rdi) - movdqa %xmm14, 96(%rdi) - movdqa %xmm13, 112(%rdi) - - scrypt_core_cleanup - ret - - -#if defined(USE_AVX) -.macro salsa8_core_3way_avx_doubleround - vpaddd %xmm0, %xmm1, %xmm4 - vpaddd %xmm8, %xmm9, %xmm6 - vpaddd %xmm12, %xmm13, %xmm7 - vpslld $7, %xmm4, %xmm5 - vpsrld $25, %xmm4, %xmm4 - vpxor %xmm5, %xmm3, %xmm3 - vpxor %xmm4, %xmm3, %xmm3 - vpslld $7, %xmm6, %xmm5 - vpsrld $25, %xmm6, %xmm6 - vpxor %xmm5, %xmm11, %xmm11 - vpxor %xmm6, %xmm11, %xmm11 - vpslld $7, %xmm7, %xmm5 - vpsrld $25, %xmm7, %xmm7 - vpxor %xmm5, %xmm15, %xmm15 - vpxor %xmm7, %xmm15, %xmm15 - - vpaddd %xmm3, %xmm0, %xmm4 - vpaddd %xmm11, %xmm8, %xmm6 - vpaddd %xmm15, %xmm12, %xmm7 - vpslld $9, %xmm4, %xmm5 - vpsrld $23, %xmm4, %xmm4 - vpxor %xmm5, %xmm2, %xmm2 - vpxor %xmm4, %xmm2, %xmm2 - 
vpslld $9, %xmm6, %xmm5 - vpsrld $23, %xmm6, %xmm6 - vpxor %xmm5, %xmm10, %xmm10 - vpxor %xmm6, %xmm10, %xmm10 - vpslld $9, %xmm7, %xmm5 - vpsrld $23, %xmm7, %xmm7 - vpxor %xmm5, %xmm14, %xmm14 - vpxor %xmm7, %xmm14, %xmm14 - - vpaddd %xmm2, %xmm3, %xmm4 - vpaddd %xmm10, %xmm11, %xmm6 - vpaddd %xmm14, %xmm15, %xmm7 - vpslld $13, %xmm4, %xmm5 - vpsrld $19, %xmm4, %xmm4 - vpshufd $0x93, %xmm3, %xmm3 - vpshufd $0x93, %xmm11, %xmm11 - vpshufd $0x93, %xmm15, %xmm15 - vpxor %xmm5, %xmm1, %xmm1 - vpxor %xmm4, %xmm1, %xmm1 - vpslld $13, %xmm6, %xmm5 - vpsrld $19, %xmm6, %xmm6 - vpxor %xmm5, %xmm9, %xmm9 - vpxor %xmm6, %xmm9, %xmm9 - vpslld $13, %xmm7, %xmm5 - vpsrld $19, %xmm7, %xmm7 - vpxor %xmm5, %xmm13, %xmm13 - vpxor %xmm7, %xmm13, %xmm13 - - vpaddd %xmm1, %xmm2, %xmm4 - vpaddd %xmm9, %xmm10, %xmm6 - vpaddd %xmm13, %xmm14, %xmm7 - vpslld $18, %xmm4, %xmm5 - vpsrld $14, %xmm4, %xmm4 - vpshufd $0x4e, %xmm2, %xmm2 - vpshufd $0x4e, %xmm10, %xmm10 - vpshufd $0x4e, %xmm14, %xmm14 - vpxor %xmm5, %xmm0, %xmm0 - vpxor %xmm4, %xmm0, %xmm0 - vpslld $18, %xmm6, %xmm5 - vpsrld $14, %xmm6, %xmm6 - vpxor %xmm5, %xmm8, %xmm8 - vpxor %xmm6, %xmm8, %xmm8 - vpslld $18, %xmm7, %xmm5 - vpsrld $14, %xmm7, %xmm7 - vpxor %xmm5, %xmm12, %xmm12 - vpxor %xmm7, %xmm12, %xmm12 - - vpaddd %xmm0, %xmm3, %xmm4 - vpaddd %xmm8, %xmm11, %xmm6 - vpaddd %xmm12, %xmm15, %xmm7 - vpslld $7, %xmm4, %xmm5 - vpsrld $25, %xmm4, %xmm4 - vpshufd $0x39, %xmm1, %xmm1 - vpxor %xmm5, %xmm1, %xmm1 - vpxor %xmm4, %xmm1, %xmm1 - vpslld $7, %xmm6, %xmm5 - vpsrld $25, %xmm6, %xmm6 - vpshufd $0x39, %xmm9, %xmm9 - vpxor %xmm5, %xmm9, %xmm9 - vpxor %xmm6, %xmm9, %xmm9 - vpslld $7, %xmm7, %xmm5 - vpsrld $25, %xmm7, %xmm7 - vpshufd $0x39, %xmm13, %xmm13 - vpxor %xmm5, %xmm13, %xmm13 - vpxor %xmm7, %xmm13, %xmm13 - - vpaddd %xmm1, %xmm0, %xmm4 - vpaddd %xmm9, %xmm8, %xmm6 - vpaddd %xmm13, %xmm12, %xmm7 - vpslld $9, %xmm4, %xmm5 - vpsrld $23, %xmm4, %xmm4 - vpxor %xmm5, %xmm2, %xmm2 - vpxor %xmm4, %xmm2, %xmm2 - vpslld $9, %xmm6, %xmm5 - vpsrld $23, %xmm6, %xmm6 - vpxor %xmm5, %xmm10, %xmm10 - vpxor %xmm6, %xmm10, %xmm10 - vpslld $9, %xmm7, %xmm5 - vpsrld $23, %xmm7, %xmm7 - vpxor %xmm5, %xmm14, %xmm14 - vpxor %xmm7, %xmm14, %xmm14 - - vpaddd %xmm2, %xmm1, %xmm4 - vpaddd %xmm10, %xmm9, %xmm6 - vpaddd %xmm14, %xmm13, %xmm7 - vpslld $13, %xmm4, %xmm5 - vpsrld $19, %xmm4, %xmm4 - vpshufd $0x93, %xmm1, %xmm1 - vpshufd $0x93, %xmm9, %xmm9 - vpshufd $0x93, %xmm13, %xmm13 - vpxor %xmm5, %xmm3, %xmm3 - vpxor %xmm4, %xmm3, %xmm3 - vpslld $13, %xmm6, %xmm5 - vpsrld $19, %xmm6, %xmm6 - vpxor %xmm5, %xmm11, %xmm11 - vpxor %xmm6, %xmm11, %xmm11 - vpslld $13, %xmm7, %xmm5 - vpsrld $19, %xmm7, %xmm7 - vpxor %xmm5, %xmm15, %xmm15 - vpxor %xmm7, %xmm15, %xmm15 - - vpaddd %xmm3, %xmm2, %xmm4 - vpaddd %xmm11, %xmm10, %xmm6 - vpaddd %xmm15, %xmm14, %xmm7 - vpslld $18, %xmm4, %xmm5 - vpsrld $14, %xmm4, %xmm4 - vpshufd $0x4e, %xmm2, %xmm2 - vpshufd $0x4e, %xmm10, %xmm10 - vpxor %xmm5, %xmm0, %xmm0 - vpxor %xmm4, %xmm0, %xmm0 - vpslld $18, %xmm6, %xmm5 - vpsrld $14, %xmm6, %xmm6 - vpshufd $0x4e, %xmm14, %xmm14 - vpshufd $0x39, %xmm11, %xmm11 - vpxor %xmm5, %xmm8, %xmm8 - vpxor %xmm6, %xmm8, %xmm8 - vpslld $18, %xmm7, %xmm5 - vpsrld $14, %xmm7, %xmm7 - vpshufd $0x39, %xmm3, %xmm3 - vpshufd $0x39, %xmm15, %xmm15 - vpxor %xmm5, %xmm12, %xmm12 - vpxor %xmm7, %xmm12, %xmm12 -.endm - -.macro salsa8_core_3way_avx - salsa8_core_3way_avx_doubleround - salsa8_core_3way_avx_doubleround - salsa8_core_3way_avx_doubleround - salsa8_core_3way_avx_doubleround -.endm -#endif /* USE_AVX */ - - 
.text - .p2align 6 - .globl scrypt_core_3way - .globl _scrypt_core_3way -scrypt_core_3way: -_scrypt_core_3way: - pushq %rbx - pushq %rbp -#if defined(_WIN64) || defined(__CYGWIN__) - subq $176, %rsp - movdqa %xmm6, 8(%rsp) - movdqa %xmm7, 24(%rsp) - movdqa %xmm8, 40(%rsp) - movdqa %xmm9, 56(%rsp) - movdqa %xmm10, 72(%rsp) - movdqa %xmm11, 88(%rsp) - movdqa %xmm12, 104(%rsp) - movdqa %xmm13, 120(%rsp) - movdqa %xmm14, 136(%rsp) - movdqa %xmm15, 152(%rsp) - pushq %rdi - pushq %rsi - movq %rcx, %rdi - movq %rdx, %rsi -#else - movq %rdx, %r8 -#endif - subq $392, %rsp - -.macro scrypt_core_3way_cleanup - addq $392, %rsp + addq $136, %rsp #if defined(_WIN64) || defined(__CYGWIN__) popq %rsi popq %rdi @@ -1053,1090 +877,8523 @@ _scrypt_core_3way: movdqa 152(%rsp), %xmm15 addq $176, %rsp #endif + popq %r15 + popq %r14 + popq %r13 + popq %r12 popq %rbp popq %rbx -.endm - -#if !defined(USE_AVX) - jmp scrypt_core_3way_xmm -#else - /* Check for AVX and OSXSAVE support */ - movl $1, %eax - cpuid - andl $0x18000000, %ecx - cmpl $0x18000000, %ecx - jne scrypt_core_3way_xmm - /* Check for XMM and YMM state support */ - xorl %ecx, %ecx - xgetbv - andl $0x00000006, %eax - cmpl $0x00000006, %eax - jne scrypt_core_3way_xmm -#if defined(USE_XOP) - /* Check for XOP support */ - movl $0x80000001, %eax - cpuid - andl $0x00000800, %ecx - jnz scrypt_core_3way_xop -#endif + ret + + + -scrypt_core_3way_avx: - scrypt_shuffle %rdi, 0, %rsp, 0 - scrypt_shuffle %rdi, 64, %rsp, 64 - scrypt_shuffle %rdi, 128, %rsp, 128 - scrypt_shuffle %rdi, 192, %rsp, 192 - scrypt_shuffle %rdi, 256, %rsp, 256 - scrypt_shuffle %rdi, 320, %rsp, 320 + .p2align 6 +scrypt_core_xmm: + pcmpeqw %xmm1, %xmm1 + psrlq $32, %xmm1 - movdqa 64(%rsp), %xmm0 - movdqa 80(%rsp), %xmm1 - movdqa 96(%rsp), %xmm2 - movdqa 112(%rsp), %xmm3 - movdqa 128+64(%rsp), %xmm8 - movdqa 128+80(%rsp), %xmm9 - movdqa 128+96(%rsp), %xmm10 - movdqa 128+112(%rsp), %xmm11 - movdqa 256+64(%rsp), %xmm12 - movdqa 256+80(%rsp), %xmm13 - movdqa 256+96(%rsp), %xmm14 - movdqa 256+112(%rsp), %xmm15 - - movq %rsi, %rbx - leaq (%r8, %r8, 2), %rax - shlq $7, %rax - addq %rsi, %rax -scrypt_core_3way_avx_loop1: - movdqa %xmm0, 64(%rbx) - movdqa %xmm1, 80(%rbx) - movdqa %xmm2, 96(%rbx) - movdqa %xmm3, 112(%rbx) - pxor 0(%rsp), %xmm0 - pxor 16(%rsp), %xmm1 - pxor 32(%rsp), %xmm2 - pxor 48(%rsp), %xmm3 - movdqa %xmm8, 128+64(%rbx) - movdqa %xmm9, 128+80(%rbx) - movdqa %xmm10, 128+96(%rbx) - movdqa %xmm11, 128+112(%rbx) - pxor 128+0(%rsp), %xmm8 - pxor 128+16(%rsp), %xmm9 - pxor 128+32(%rsp), %xmm10 - pxor 128+48(%rsp), %xmm11 - movdqa %xmm12, 256+64(%rbx) - movdqa %xmm13, 256+80(%rbx) - movdqa %xmm14, 256+96(%rbx) - movdqa %xmm15, 256+112(%rbx) - pxor 256+0(%rsp), %xmm12 - pxor 256+16(%rsp), %xmm13 - pxor 256+32(%rsp), %xmm14 - pxor 256+48(%rsp), %xmm15 - movdqa %xmm0, 0(%rbx) - movdqa %xmm1, 16(%rbx) - movdqa %xmm2, 32(%rbx) - movdqa %xmm3, 48(%rbx) - movdqa %xmm8, 128+0(%rbx) - movdqa %xmm9, 128+16(%rbx) - movdqa %xmm10, 128+32(%rbx) - movdqa %xmm11, 128+48(%rbx) - movdqa %xmm12, 256+0(%rbx) - movdqa %xmm13, 256+16(%rbx) - movdqa %xmm14, 256+32(%rbx) - movdqa %xmm15, 256+48(%rbx) + movdqa 0(%rdi), %xmm8 + movdqa 16(%rdi), %xmm11 + movdqa 32(%rdi), %xmm10 + movdqa 48(%rdi), %xmm9 + movdqa %xmm8, %xmm0 + pxor %xmm11, %xmm8 + pand %xmm1, %xmm8 + pxor %xmm11, %xmm8 + pxor %xmm10, %xmm11 + pand %xmm1, %xmm11 + pxor %xmm10, %xmm11 + pxor %xmm9, %xmm10 + pand %xmm1, %xmm10 + pxor %xmm9, %xmm10 + pxor %xmm0, %xmm9 + pand %xmm1, %xmm9 + pxor %xmm0, %xmm9 + movdqa %xmm8, %xmm0 + pshufd $0x4e, %xmm10, 
%xmm10 + punpcklqdq %xmm10, %xmm8 + punpckhqdq %xmm0, %xmm10 + movdqa %xmm11, %xmm0 + pshufd $0x4e, %xmm9, %xmm9 + punpcklqdq %xmm9, %xmm11 + punpckhqdq %xmm0, %xmm9 - salsa8_core_3way_avx - paddd 0(%rbx), %xmm0 - paddd 16(%rbx), %xmm1 - paddd 32(%rbx), %xmm2 - paddd 48(%rbx), %xmm3 - paddd 128+0(%rbx), %xmm8 - paddd 128+16(%rbx), %xmm9 - paddd 128+32(%rbx), %xmm10 - paddd 128+48(%rbx), %xmm11 - paddd 256+0(%rbx), %xmm12 - paddd 256+16(%rbx), %xmm13 - paddd 256+32(%rbx), %xmm14 - paddd 256+48(%rbx), %xmm15 - movdqa %xmm0, 0(%rsp) - movdqa %xmm1, 16(%rsp) - movdqa %xmm2, 32(%rsp) - movdqa %xmm3, 48(%rsp) - movdqa %xmm8, 128+0(%rsp) - movdqa %xmm9, 128+16(%rsp) - movdqa %xmm10, 128+32(%rsp) - movdqa %xmm11, 128+48(%rsp) - movdqa %xmm12, 256+0(%rsp) - movdqa %xmm13, 256+16(%rsp) - movdqa %xmm14, 256+32(%rsp) - movdqa %xmm15, 256+48(%rsp) + movdqa 64(%rdi), %xmm12 + movdqa 80(%rdi), %xmm15 + movdqa 96(%rdi), %xmm14 + movdqa 112(%rdi), %xmm13 + movdqa %xmm12, %xmm0 + pxor %xmm15, %xmm12 + pand %xmm1, %xmm12 + pxor %xmm15, %xmm12 + pxor %xmm14, %xmm15 + pand %xmm1, %xmm15 + pxor %xmm14, %xmm15 + pxor %xmm13, %xmm14 + pand %xmm1, %xmm14 + pxor %xmm13, %xmm14 + pxor %xmm0, %xmm13 + pand %xmm1, %xmm13 + pxor %xmm0, %xmm13 + movdqa %xmm12, %xmm0 + pshufd $0x4e, %xmm14, %xmm14 + punpcklqdq %xmm14, %xmm12 + punpckhqdq %xmm0, %xmm14 + movdqa %xmm15, %xmm0 + pshufd $0x4e, %xmm13, %xmm13 + punpcklqdq %xmm13, %xmm15 + punpckhqdq %xmm0, %xmm13 - pxor 64(%rbx), %xmm0 - pxor 80(%rbx), %xmm1 - pxor 96(%rbx), %xmm2 - pxor 112(%rbx), %xmm3 - pxor 128+64(%rbx), %xmm8 - pxor 128+80(%rbx), %xmm9 - pxor 128+96(%rbx), %xmm10 - pxor 128+112(%rbx), %xmm11 - pxor 256+64(%rbx), %xmm12 - pxor 256+80(%rbx), %xmm13 - pxor 256+96(%rbx), %xmm14 - pxor 256+112(%rbx), %xmm15 - movdqa %xmm0, 64(%rsp) - movdqa %xmm1, 80(%rsp) - movdqa %xmm2, 96(%rsp) - movdqa %xmm3, 112(%rsp) - movdqa %xmm8, 128+64(%rsp) - movdqa %xmm9, 128+80(%rsp) - movdqa %xmm10, 128+96(%rsp) - movdqa %xmm11, 128+112(%rsp) - movdqa %xmm12, 256+64(%rsp) - movdqa %xmm13, 256+80(%rsp) - movdqa %xmm14, 256+96(%rsp) - movdqa %xmm15, 256+112(%rsp) - salsa8_core_3way_avx - paddd 64(%rsp), %xmm0 - paddd 80(%rsp), %xmm1 - paddd 96(%rsp), %xmm2 - paddd 112(%rsp), %xmm3 - paddd 128+64(%rsp), %xmm8 - paddd 128+80(%rsp), %xmm9 - paddd 128+96(%rsp), %xmm10 - paddd 128+112(%rsp), %xmm11 - paddd 256+64(%rsp), %xmm12 - paddd 256+80(%rsp), %xmm13 - paddd 256+96(%rsp), %xmm14 - paddd 256+112(%rsp), %xmm15 + movq %rsi, %rdx + movq %r8, %rcx + shlq $7, %rcx + addq %rsi, %rcx +scrypt_core_xmm_loop1: + pxor %xmm12, %xmm8 + pxor %xmm13, %xmm9 + pxor %xmm14, %xmm10 + pxor %xmm15, %xmm11 + movdqa %xmm8, 0(%rdx) + movdqa %xmm9, 16(%rdx) + movdqa %xmm10, 32(%rdx) + movdqa %xmm11, 48(%rdx) + movdqa %xmm12, 64(%rdx) + movdqa %xmm13, 80(%rdx) + movdqa %xmm14, 96(%rdx) + movdqa %xmm15, 112(%rdx) - addq $3*128, %rbx - cmpq %rax, %rbx - jne scrypt_core_3way_avx_loop1 + movdqa %xmm8, %xmm0 + movdqa %xmm9, %xmm1 + movdqa %xmm10, %xmm2 + movdqa %xmm11, %xmm3 + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 - movdqa %xmm0, 64(%rsp) - movdqa %xmm1, 80(%rsp) - movdqa %xmm2, 96(%rsp) - movdqa %xmm3, 112(%rsp) - movdqa %xmm8, 128+64(%rsp) - movdqa %xmm9, 128+80(%rsp) - movdqa %xmm10, 128+96(%rsp) - movdqa %xmm11, 128+112(%rsp) - movdqa %xmm12, 256+64(%rsp) - movdqa %xmm13, 256+80(%rsp) - movdqa %xmm14, 256+96(%rsp) - movdqa %xmm15, 256+112(%rsp) + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 
+ pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 - movq %r8, %rcx - subq $1, %r8 -scrypt_core_3way_avx_loop2: - movd %xmm0, %ebp - movd %xmm8, %ebx - movd %xmm12, %eax - pxor 0(%rsp), %xmm0 - pxor 16(%rsp), %xmm1 - pxor 32(%rsp), %xmm2 - pxor 48(%rsp), %xmm3 - pxor 128+0(%rsp), %xmm8 - pxor 128+16(%rsp), %xmm9 - pxor 128+32(%rsp), %xmm10 - pxor 128+48(%rsp), %xmm11 - pxor 256+0(%rsp), %xmm12 - pxor 256+16(%rsp), %xmm13 - pxor 256+32(%rsp), %xmm14 - pxor 256+48(%rsp), %xmm15 - andl %r8d, %ebp - leaq (%rbp, %rbp, 2), %rbp - shll $7, %ebp - andl %r8d, %ebx - leaq 1(%rbx, %rbx, 2), %rbx - shll $7, %ebx - andl %r8d, %eax - leaq 2(%rax, %rax, 2), %rax - shll $7, %eax - pxor 0(%rsi, %rbp), %xmm0 - pxor 16(%rsi, %rbp), %xmm1 - pxor 32(%rsi, %rbp), %xmm2 - pxor 48(%rsi, %rbp), %xmm3 - pxor 0(%rsi, %rbx), %xmm8 - pxor 16(%rsi, %rbx), %xmm9 - pxor 32(%rsi, %rbx), %xmm10 - pxor 48(%rsi, %rbx), %xmm11 - pxor 0(%rsi, %rax), %xmm12 - pxor 16(%rsi, %rax), %xmm13 - pxor 32(%rsi, %rax), %xmm14 - pxor 48(%rsi, %rax), %xmm15 + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 - movdqa %xmm0, 0(%rsp) - movdqa %xmm1, 16(%rsp) - movdqa %xmm2, 32(%rsp) - movdqa %xmm3, 48(%rsp) - movdqa %xmm8, 128+0(%rsp) - movdqa %xmm9, 128+16(%rsp) - movdqa %xmm10, 128+32(%rsp) - movdqa %xmm11, 128+48(%rsp) - movdqa %xmm12, 256+0(%rsp) - movdqa %xmm13, 256+16(%rsp) - movdqa %xmm14, 256+32(%rsp) - movdqa %xmm15, 256+48(%rsp) - salsa8_core_3way_avx - paddd 0(%rsp), %xmm0 - paddd 16(%rsp), %xmm1 - paddd 32(%rsp), %xmm2 - paddd 48(%rsp), %xmm3 - paddd 128+0(%rsp), %xmm8 - paddd 128+16(%rsp), %xmm9 - paddd 128+32(%rsp), %xmm10 - paddd 128+48(%rsp), %xmm11 - paddd 256+0(%rsp), %xmm12 - paddd 256+16(%rsp), %xmm13 - paddd 256+32(%rsp), %xmm14 - paddd 256+48(%rsp), %xmm15 - movdqa %xmm0, 0(%rsp) - movdqa %xmm1, 16(%rsp) - movdqa %xmm2, 32(%rsp) - movdqa %xmm3, 48(%rsp) - movdqa %xmm8, 128+0(%rsp) - movdqa %xmm9, 128+16(%rsp) - movdqa %xmm10, 128+32(%rsp) - movdqa %xmm11, 128+48(%rsp) - movdqa %xmm12, 256+0(%rsp) - movdqa %xmm13, 256+16(%rsp) - movdqa %xmm14, 256+32(%rsp) - movdqa %xmm15, 256+48(%rsp) + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 - pxor 64(%rsi, %rbp), %xmm0 - pxor 80(%rsi, %rbp), %xmm1 - pxor 96(%rsi, %rbp), %xmm2 - pxor 112(%rsi, %rbp), %xmm3 - pxor 64(%rsi, %rbx), %xmm8 - pxor 80(%rsi, %rbx), %xmm9 - pxor 96(%rsi, %rbx), %xmm10 - pxor 112(%rsi, %rbx), %xmm11 - pxor 64(%rsi, %rax), %xmm12 - pxor 80(%rsi, %rax), %xmm13 - pxor 96(%rsi, %rax), %xmm14 - pxor 112(%rsi, %rax), %xmm15 - pxor 64(%rsp), %xmm0 - pxor 80(%rsp), %xmm1 - pxor 96(%rsp), %xmm2 - pxor 112(%rsp), %xmm3 - pxor 128+64(%rsp), %xmm8 - pxor 128+80(%rsp), %xmm9 - pxor 128+96(%rsp), %xmm10 - pxor 128+112(%rsp), %xmm11 - pxor 256+64(%rsp), %xmm12 - pxor 256+80(%rsp), %xmm13 - pxor 256+96(%rsp), %xmm14 - pxor 256+112(%rsp), %xmm15 - movdqa %xmm0, 64(%rsp) - movdqa %xmm1, 80(%rsp) - movdqa %xmm2, 96(%rsp) - movdqa %xmm3, 112(%rsp) - movdqa %xmm8, 128+64(%rsp) - movdqa %xmm9, 128+80(%rsp) - movdqa %xmm10, 128+96(%rsp) - movdqa %xmm11, 128+112(%rsp) - movdqa %xmm12, 256+64(%rsp) - movdqa %xmm13, 256+80(%rsp) - movdqa %xmm14, 256+96(%rsp) - movdqa %xmm15, 256+112(%rsp) - salsa8_core_3way_avx - paddd 64(%rsp), %xmm0 - paddd 80(%rsp), %xmm1 - paddd 
96(%rsp), %xmm2 - paddd 112(%rsp), %xmm3 - paddd 128+64(%rsp), %xmm8 - paddd 128+80(%rsp), %xmm9 - paddd 128+96(%rsp), %xmm10 - paddd 128+112(%rsp), %xmm11 - paddd 256+64(%rsp), %xmm12 - paddd 256+80(%rsp), %xmm13 - paddd 256+96(%rsp), %xmm14 - paddd 256+112(%rsp), %xmm15 - movdqa %xmm0, 64(%rsp) - movdqa %xmm1, 80(%rsp) - movdqa %xmm2, 96(%rsp) - movdqa %xmm3, 112(%rsp) - movdqa %xmm8, 128+64(%rsp) - movdqa %xmm9, 128+80(%rsp) - movdqa %xmm10, 128+96(%rsp) - movdqa %xmm11, 128+112(%rsp) - movdqa %xmm12, 256+64(%rsp) - movdqa %xmm13, 256+80(%rsp) - movdqa %xmm14, 256+96(%rsp) - movdqa %xmm15, 256+112(%rsp) + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 - subq $1, %rcx - ja scrypt_core_3way_avx_loop2 + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 - scrypt_shuffle %rsp, 0, %rdi, 0 - scrypt_shuffle %rsp, 64, %rdi, 64 - scrypt_shuffle %rsp, 128, %rdi, 128 - scrypt_shuffle %rsp, 192, %rdi, 192 - scrypt_shuffle %rsp, 256, %rdi, 256 - scrypt_shuffle %rsp, 320, %rdi, 320 + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 - scrypt_core_3way_cleanup - ret - -#if defined(USE_XOP) -.macro salsa8_core_3way_xop_doubleround - vpaddd %xmm0, %xmm1, %xmm4 - vpaddd %xmm8, %xmm9, %xmm6 - vpaddd %xmm12, %xmm13, %xmm7 - vprotd $7, %xmm4, %xmm4 - vprotd $7, %xmm6, %xmm6 - vprotd $7, %xmm7, %xmm7 - vpxor %xmm4, %xmm3, %xmm3 - vpxor %xmm6, %xmm11, %xmm11 - vpxor %xmm7, %xmm15, %xmm15 + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 - vpaddd %xmm3, %xmm0, %xmm4 - vpaddd %xmm11, %xmm8, %xmm6 - vpaddd %xmm15, %xmm12, %xmm7 - vprotd $9, %xmm4, %xmm4 - vprotd $9, %xmm6, %xmm6 - vprotd $9, %xmm7, %xmm7 - vpxor %xmm4, %xmm2, %xmm2 - vpxor %xmm6, %xmm10, %xmm10 - vpxor %xmm7, %xmm14, %xmm14 + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 - vpaddd %xmm2, %xmm3, %xmm4 - vpaddd %xmm10, %xmm11, %xmm6 - vpaddd %xmm14, %xmm15, %xmm7 - vprotd $13, %xmm4, %xmm4 - vprotd $13, %xmm6, %xmm6 - vprotd $13, %xmm7, %xmm7 - vpshufd $0x93, %xmm3, %xmm3 - vpshufd $0x93, %xmm11, %xmm11 - vpshufd $0x93, %xmm15, %xmm15 - vpxor %xmm4, %xmm1, %xmm1 - vpxor %xmm6, %xmm9, %xmm9 - vpxor %xmm7, %xmm13, %xmm13 + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 - vpaddd %xmm1, %xmm2, %xmm4 - vpaddd %xmm9, %xmm10, %xmm6 - vpaddd %xmm13, %xmm14, %xmm7 - vprotd $18, %xmm4, %xmm4 - vprotd $18, %xmm6, %xmm6 - vprotd $18, %xmm7, %xmm7 - vpshufd $0x4e, %xmm2, %xmm2 - vpshufd $0x4e, %xmm10, %xmm10 - vpshufd $0x4e, %xmm14, %xmm14 - vpxor %xmm6, %xmm8, %xmm8 - vpxor %xmm4, %xmm0, %xmm0 - vpxor %xmm7, %xmm12, %xmm12 + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 - vpaddd %xmm0, %xmm3, %xmm4 - vpaddd %xmm8, 
%xmm11, %xmm6 - vpaddd %xmm12, %xmm15, %xmm7 - vprotd $7, %xmm4, %xmm4 - vprotd $7, %xmm6, %xmm6 - vprotd $7, %xmm7, %xmm7 - vpshufd $0x39, %xmm1, %xmm1 - vpshufd $0x39, %xmm9, %xmm9 - vpshufd $0x39, %xmm13, %xmm13 - vpxor %xmm4, %xmm1, %xmm1 - vpxor %xmm6, %xmm9, %xmm9 - vpxor %xmm7, %xmm13, %xmm13 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 - vpaddd %xmm1, %xmm0, %xmm4 - vpaddd %xmm9, %xmm8, %xmm6 - vpaddd %xmm13, %xmm12, %xmm7 - vprotd $9, %xmm4, %xmm4 - vprotd $9, %xmm6, %xmm6 - vprotd $9, %xmm7, %xmm7 - vpxor %xmm4, %xmm2, %xmm2 - vpxor %xmm6, %xmm10, %xmm10 - vpxor %xmm7, %xmm14, %xmm14 + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 - vpaddd %xmm2, %xmm1, %xmm4 - vpaddd %xmm10, %xmm9, %xmm6 - vpaddd %xmm14, %xmm13, %xmm7 - vprotd $13, %xmm4, %xmm4 - vprotd $13, %xmm6, %xmm6 - vprotd $13, %xmm7, %xmm7 - vpshufd $0x93, %xmm1, %xmm1 - vpshufd $0x93, %xmm9, %xmm9 - vpshufd $0x93, %xmm13, %xmm13 - vpxor %xmm4, %xmm3, %xmm3 - vpxor %xmm6, %xmm11, %xmm11 - vpxor %xmm7, %xmm15, %xmm15 + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 - vpaddd %xmm3, %xmm2, %xmm4 - vpaddd %xmm11, %xmm10, %xmm6 - vpaddd %xmm15, %xmm14, %xmm7 - vprotd $18, %xmm4, %xmm4 - vprotd $18, %xmm6, %xmm6 - vprotd $18, %xmm7, %xmm7 - vpshufd $0x4e, %xmm2, %xmm2 - vpshufd $0x4e, %xmm10, %xmm10 - vpshufd $0x4e, %xmm14, %xmm14 - vpxor %xmm4, %xmm0, %xmm0 - vpxor %xmm6, %xmm8, %xmm8 - vpxor %xmm7, %xmm12, %xmm12 - vpshufd $0x39, %xmm3, %xmm3 - vpshufd $0x39, %xmm11, %xmm11 - vpshufd $0x39, %xmm15, %xmm15 -.endm - -.macro salsa8_core_3way_xop - salsa8_core_3way_xop_doubleround - salsa8_core_3way_xop_doubleround - salsa8_core_3way_xop_doubleround - salsa8_core_3way_xop_doubleround -.endm + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 - .p2align 6 -scrypt_core_3way_xop: - scrypt_shuffle %rdi, 0, %rsp, 0 - scrypt_shuffle %rdi, 64, %rsp, 64 - scrypt_shuffle %rdi, 128, %rsp, 128 - scrypt_shuffle %rdi, 192, %rsp, 192 - scrypt_shuffle %rdi, 256, %rsp, 256 - scrypt_shuffle %rdi, 320, %rsp, 320 + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 - movdqa 64(%rsp), %xmm0 - movdqa 80(%rsp), %xmm1 - movdqa 96(%rsp), %xmm2 - movdqa 112(%rsp), %xmm3 - movdqa 128+64(%rsp), %xmm8 - movdqa 128+80(%rsp), %xmm9 - movdqa 128+96(%rsp), %xmm10 - movdqa 128+112(%rsp), %xmm11 - movdqa 256+64(%rsp), %xmm12 - movdqa 256+80(%rsp), %xmm13 - movdqa 256+96(%rsp), %xmm14 - movdqa 256+112(%rsp), %xmm15 + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 - movq %rsi, %rbx - leaq (%r8, %r8, 2), %rax - shlq $7, %rax - addq %rsi, %rax -scrypt_core_3way_xop_loop1: - movdqa %xmm0, 64(%rbx) - movdqa %xmm1, 80(%rbx) - movdqa %xmm2, 96(%rbx) - movdqa %xmm3, 112(%rbx) - pxor 0(%rsp), %xmm0 - pxor 16(%rsp), %xmm1 - pxor 32(%rsp), 
%xmm2 - pxor 48(%rsp), %xmm3 - movdqa %xmm8, 128+64(%rbx) - movdqa %xmm9, 128+80(%rbx) - movdqa %xmm10, 128+96(%rbx) - movdqa %xmm11, 128+112(%rbx) - pxor 128+0(%rsp), %xmm8 - pxor 128+16(%rsp), %xmm9 - pxor 128+32(%rsp), %xmm10 - pxor 128+48(%rsp), %xmm11 - movdqa %xmm12, 256+64(%rbx) - movdqa %xmm13, 256+80(%rbx) - movdqa %xmm14, 256+96(%rbx) - movdqa %xmm15, 256+112(%rbx) - pxor 256+0(%rsp), %xmm12 - pxor 256+16(%rsp), %xmm13 - pxor 256+32(%rsp), %xmm14 - pxor 256+48(%rsp), %xmm15 - movdqa %xmm0, 0(%rbx) - movdqa %xmm1, 16(%rbx) - movdqa %xmm2, 32(%rbx) - movdqa %xmm3, 48(%rbx) - movdqa %xmm8, 128+0(%rbx) - movdqa %xmm9, 128+16(%rbx) - movdqa %xmm10, 128+32(%rbx) - movdqa %xmm11, 128+48(%rbx) - movdqa %xmm12, 256+0(%rbx) - movdqa %xmm13, 256+16(%rbx) - movdqa %xmm14, 256+32(%rbx) - movdqa %xmm15, 256+48(%rbx) + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 - salsa8_core_3way_xop - paddd 0(%rbx), %xmm0 - paddd 16(%rbx), %xmm1 - paddd 32(%rbx), %xmm2 - paddd 48(%rbx), %xmm3 - paddd 128+0(%rbx), %xmm8 - paddd 128+16(%rbx), %xmm9 - paddd 128+32(%rbx), %xmm10 - paddd 128+48(%rbx), %xmm11 - paddd 256+0(%rbx), %xmm12 - paddd 256+16(%rbx), %xmm13 - paddd 256+32(%rbx), %xmm14 - paddd 256+48(%rbx), %xmm15 - movdqa %xmm0, 0(%rsp) - movdqa %xmm1, 16(%rsp) - movdqa %xmm2, 32(%rsp) - movdqa %xmm3, 48(%rsp) - movdqa %xmm8, 128+0(%rsp) - movdqa %xmm9, 128+16(%rsp) - movdqa %xmm10, 128+32(%rsp) - movdqa %xmm11, 128+48(%rsp) - movdqa %xmm12, 256+0(%rsp) - movdqa %xmm13, 256+16(%rsp) - movdqa %xmm14, 256+32(%rsp) - movdqa %xmm15, 256+48(%rsp) + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 - pxor 64(%rbx), %xmm0 - pxor 80(%rbx), %xmm1 - pxor 96(%rbx), %xmm2 - pxor 112(%rbx), %xmm3 - pxor 128+64(%rbx), %xmm8 - pxor 128+80(%rbx), %xmm9 - pxor 128+96(%rbx), %xmm10 - pxor 128+112(%rbx), %xmm11 - pxor 256+64(%rbx), %xmm12 - pxor 256+80(%rbx), %xmm13 - pxor 256+96(%rbx), %xmm14 - pxor 256+112(%rbx), %xmm15 - movdqa %xmm0, 64(%rsp) - movdqa %xmm1, 80(%rsp) - movdqa %xmm2, 96(%rsp) - movdqa %xmm3, 112(%rsp) - movdqa %xmm8, 128+64(%rsp) - movdqa %xmm9, 128+80(%rsp) - movdqa %xmm10, 128+96(%rsp) - movdqa %xmm11, 128+112(%rsp) - movdqa %xmm12, 256+64(%rsp) - movdqa %xmm13, 256+80(%rsp) - movdqa %xmm14, 256+96(%rsp) - movdqa %xmm15, 256+112(%rsp) - salsa8_core_3way_xop - paddd 64(%rsp), %xmm0 - paddd 80(%rsp), %xmm1 - paddd 96(%rsp), %xmm2 - paddd 112(%rsp), %xmm3 - paddd 128+64(%rsp), %xmm8 - paddd 128+80(%rsp), %xmm9 - paddd 128+96(%rsp), %xmm10 - paddd 128+112(%rsp), %xmm11 - paddd 256+64(%rsp), %xmm12 - paddd 256+80(%rsp), %xmm13 - paddd 256+96(%rsp), %xmm14 - paddd 256+112(%rsp), %xmm15 + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 - addq $3*128, %rbx - cmpq %rax, %rbx - jne scrypt_core_3way_xop_loop1 + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 - movdqa %xmm0, 64(%rsp) - movdqa %xmm1, 80(%rsp) - movdqa %xmm2, 96(%rsp) - movdqa %xmm3, 112(%rsp) - movdqa %xmm8, 128+64(%rsp) - movdqa %xmm9, 128+80(%rsp) - movdqa %xmm10, 128+96(%rsp) - movdqa %xmm11, 128+112(%rsp) - movdqa %xmm12, 256+64(%rsp) - movdqa %xmm13, 256+80(%rsp) - movdqa %xmm14, 256+96(%rsp) - 
movdqa %xmm15, 256+112(%rsp) - - movq %r8, %rcx - subq $1, %r8 -scrypt_core_3way_xop_loop2: - movd %xmm0, %ebp - movd %xmm8, %ebx - movd %xmm12, %eax - pxor 0(%rsp), %xmm0 - pxor 16(%rsp), %xmm1 - pxor 32(%rsp), %xmm2 - pxor 48(%rsp), %xmm3 - pxor 128+0(%rsp), %xmm8 - pxor 128+16(%rsp), %xmm9 - pxor 128+32(%rsp), %xmm10 - pxor 128+48(%rsp), %xmm11 - pxor 256+0(%rsp), %xmm12 - pxor 256+16(%rsp), %xmm13 - pxor 256+32(%rsp), %xmm14 - pxor 256+48(%rsp), %xmm15 - andl %r8d, %ebp - leaq (%rbp, %rbp, 2), %rbp - shll $7, %ebp - andl %r8d, %ebx - leaq 1(%rbx, %rbx, 2), %rbx - shll $7, %ebx - andl %r8d, %eax - leaq 2(%rax, %rax, 2), %rax - shll $7, %eax - pxor 0(%rsi, %rbp), %xmm0 - pxor 16(%rsi, %rbp), %xmm1 - pxor 32(%rsi, %rbp), %xmm2 - pxor 48(%rsi, %rbp), %xmm3 - pxor 0(%rsi, %rbx), %xmm8 - pxor 16(%rsi, %rbx), %xmm9 - pxor 32(%rsi, %rbx), %xmm10 - pxor 48(%rsi, %rbx), %xmm11 - pxor 0(%rsi, %rax), %xmm12 - pxor 16(%rsi, %rax), %xmm13 - pxor 32(%rsi, %rax), %xmm14 - pxor 48(%rsi, %rax), %xmm15 - - movdqa %xmm0, 0(%rsp) - movdqa %xmm1, 16(%rsp) - movdqa %xmm2, 32(%rsp) - movdqa %xmm3, 48(%rsp) - movdqa %xmm8, 128+0(%rsp) - movdqa %xmm9, 128+16(%rsp) - movdqa %xmm10, 128+32(%rsp) - movdqa %xmm11, 128+48(%rsp) - movdqa %xmm12, 256+0(%rsp) - movdqa %xmm13, 256+16(%rsp) - movdqa %xmm14, 256+32(%rsp) - movdqa %xmm15, 256+48(%rsp) - salsa8_core_3way_xop - paddd 0(%rsp), %xmm0 - paddd 16(%rsp), %xmm1 - paddd 32(%rsp), %xmm2 - paddd 48(%rsp), %xmm3 - paddd 128+0(%rsp), %xmm8 - paddd 128+16(%rsp), %xmm9 - paddd 128+32(%rsp), %xmm10 - paddd 128+48(%rsp), %xmm11 - paddd 256+0(%rsp), %xmm12 - paddd 256+16(%rsp), %xmm13 - paddd 256+32(%rsp), %xmm14 - paddd 256+48(%rsp), %xmm15 - movdqa %xmm0, 0(%rsp) - movdqa %xmm1, 16(%rsp) - movdqa %xmm2, 32(%rsp) - movdqa %xmm3, 48(%rsp) - movdqa %xmm8, 128+0(%rsp) - movdqa %xmm9, 128+16(%rsp) - movdqa %xmm10, 128+32(%rsp) - movdqa %xmm11, 128+48(%rsp) - movdqa %xmm12, 256+0(%rsp) - movdqa %xmm13, 256+16(%rsp) - movdqa %xmm14, 256+32(%rsp) - movdqa %xmm15, 256+48(%rsp) - - pxor 64(%rsi, %rbp), %xmm0 - pxor 80(%rsi, %rbp), %xmm1 - pxor 96(%rsi, %rbp), %xmm2 - pxor 112(%rsi, %rbp), %xmm3 - pxor 64(%rsi, %rbx), %xmm8 - pxor 80(%rsi, %rbx), %xmm9 - pxor 96(%rsi, %rbx), %xmm10 - pxor 112(%rsi, %rbx), %xmm11 - pxor 64(%rsi, %rax), %xmm12 - pxor 80(%rsi, %rax), %xmm13 - pxor 96(%rsi, %rax), %xmm14 - pxor 112(%rsi, %rax), %xmm15 - pxor 64(%rsp), %xmm0 - pxor 80(%rsp), %xmm1 - pxor 96(%rsp), %xmm2 - pxor 112(%rsp), %xmm3 - pxor 128+64(%rsp), %xmm8 - pxor 128+80(%rsp), %xmm9 - pxor 128+96(%rsp), %xmm10 - pxor 128+112(%rsp), %xmm11 - pxor 256+64(%rsp), %xmm12 - pxor 256+80(%rsp), %xmm13 - pxor 256+96(%rsp), %xmm14 - pxor 256+112(%rsp), %xmm15 - movdqa %xmm0, 64(%rsp) - movdqa %xmm1, 80(%rsp) - movdqa %xmm2, 96(%rsp) - movdqa %xmm3, 112(%rsp) - movdqa %xmm8, 128+64(%rsp) - movdqa %xmm9, 128+80(%rsp) - movdqa %xmm10, 128+96(%rsp) - movdqa %xmm11, 128+112(%rsp) - movdqa %xmm12, 256+64(%rsp) - movdqa %xmm13, 256+80(%rsp) - movdqa %xmm14, 256+96(%rsp) - movdqa %xmm15, 256+112(%rsp) - salsa8_core_3way_xop - paddd 64(%rsp), %xmm0 - paddd 80(%rsp), %xmm1 - paddd 96(%rsp), %xmm2 - paddd 112(%rsp), %xmm3 - paddd 128+64(%rsp), %xmm8 - paddd 128+80(%rsp), %xmm9 - paddd 128+96(%rsp), %xmm10 - paddd 128+112(%rsp), %xmm11 - paddd 256+64(%rsp), %xmm12 - paddd 256+80(%rsp), %xmm13 - paddd 256+96(%rsp), %xmm14 - paddd 256+112(%rsp), %xmm15 - movdqa %xmm0, 64(%rsp) - movdqa %xmm1, 80(%rsp) - movdqa %xmm2, 96(%rsp) - movdqa %xmm3, 112(%rsp) - movdqa %xmm8, 128+64(%rsp) - movdqa %xmm9, 128+80(%rsp) - 
movdqa %xmm10, 128+96(%rsp) - movdqa %xmm11, 128+112(%rsp) - movdqa %xmm12, 256+64(%rsp) - movdqa %xmm13, 256+80(%rsp) - movdqa %xmm14, 256+96(%rsp) - movdqa %xmm15, 256+112(%rsp) - - subq $1, %rcx - ja scrypt_core_3way_xop_loop2 - - scrypt_shuffle %rsp, 0, %rdi, 0 - scrypt_shuffle %rsp, 64, %rdi, 64 - scrypt_shuffle %rsp, 128, %rdi, 128 - scrypt_shuffle %rsp, 192, %rdi, 192 - scrypt_shuffle %rsp, 256, %rdi, 256 - scrypt_shuffle %rsp, 320, %rdi, 320 - - scrypt_core_3way_cleanup - ret -#endif /* USE_XOP */ -#endif /* USE_AVX */ - -.macro salsa8_core_3way_xmm_doubleround - movdqa %xmm1, %xmm4 - movdqa %xmm9, %xmm6 - movdqa %xmm13, %xmm7 - paddd %xmm0, %xmm4 - paddd %xmm8, %xmm6 - paddd %xmm12, %xmm7 - movdqa %xmm4, %xmm5 - pslld $7, %xmm4 - psrld $25, %xmm5 - pxor %xmm4, %xmm3 - pxor %xmm5, %xmm3 - movdqa %xmm0, %xmm4 - movdqa %xmm6, %xmm5 - pslld $7, %xmm6 - psrld $25, %xmm5 - pxor %xmm6, %xmm11 - pxor %xmm5, %xmm11 - movdqa %xmm8, %xmm6 - movdqa %xmm7, %xmm5 - pslld $7, %xmm7 - psrld $25, %xmm5 - pxor %xmm7, %xmm15 - pxor %xmm5, %xmm15 - movdqa %xmm12, %xmm7 + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 paddd %xmm3, %xmm4 - paddd %xmm11, %xmm6 - paddd %xmm15, %xmm7 movdqa %xmm4, %xmm5 pslld $9, %xmm4 psrld $23, %xmm5 pxor %xmm4, %xmm2 movdqa %xmm3, %xmm4 - pshufd $0x93, %xmm3, %xmm3 pxor %xmm5, %xmm2 - movdqa %xmm6, %xmm5 - pslld $9, %xmm6 - psrld $23, %xmm5 - pxor %xmm6, %xmm10 - movdqa %xmm11, %xmm6 - pshufd $0x93, %xmm11, %xmm11 - pxor %xmm5, %xmm10 - movdqa %xmm7, %xmm5 - pslld $9, %xmm7 - psrld $23, %xmm5 - pxor %xmm7, %xmm14 - movdqa %xmm15, %xmm7 - pxor %xmm5, %xmm14 - pshufd $0x93, %xmm15, %xmm15 + pshufd $0x93, %xmm3, %xmm3 paddd %xmm2, %xmm4 - paddd %xmm10, %xmm6 - paddd %xmm14, %xmm7 movdqa %xmm4, %xmm5 pslld $13, %xmm4 psrld $19, %xmm5 pxor %xmm4, %xmm1 movdqa %xmm2, %xmm4 - pshufd $0x4e, %xmm2, %xmm2 pxor %xmm5, %xmm1 - movdqa %xmm6, %xmm5 - pslld $13, %xmm6 - psrld $19, %xmm5 - pxor %xmm6, %xmm9 - movdqa %xmm10, %xmm6 - pshufd $0x4e, %xmm10, %xmm10 - pxor %xmm5, %xmm9 - movdqa %xmm7, %xmm5 - pslld $13, %xmm7 - psrld $19, %xmm5 - pxor %xmm7, %xmm13 - movdqa %xmm14, %xmm7 - pshufd $0x4e, %xmm14, %xmm14 - pxor %xmm5, %xmm13 + pshufd $0x4e, %xmm2, %xmm2 paddd %xmm1, %xmm4 - paddd %xmm9, %xmm6 - paddd %xmm13, %xmm7 movdqa %xmm4, %xmm5 pslld $18, %xmm4 psrld $14, %xmm5 pxor %xmm4, %xmm0 - pshufd $0x39, %xmm1, %xmm1 - pxor %xmm5, %xmm0 movdqa %xmm3, %xmm4 - movdqa %xmm6, %xmm5 - pslld $18, %xmm6 - psrld $14, %xmm5 - pxor %xmm6, %xmm8 - pshufd $0x39, %xmm9, %xmm9 - pxor %xmm5, %xmm8 - movdqa %xmm11, %xmm6 - movdqa %xmm7, %xmm5 - pslld $18, %xmm7 - psrld $14, %xmm5 - pxor %xmm7, %xmm12 - movdqa %xmm15, %xmm7 - pxor %xmm5, %xmm12 - pshufd $0x39, %xmm13, %xmm13 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 paddd %xmm0, %xmm4 - paddd %xmm8, %xmm6 - paddd %xmm12, %xmm7 movdqa %xmm4, %xmm5 pslld $7, %xmm4 psrld $25, %xmm5 pxor %xmm4, %xmm1 - pxor %xmm5, %xmm1 movdqa %xmm0, %xmm4 - movdqa %xmm6, %xmm5 - pslld $7, %xmm6 - psrld $25, %xmm5 - pxor %xmm6, %xmm9 - pxor %xmm5, %xmm9 - movdqa %xmm8, %xmm6 - movdqa %xmm7, %xmm5 - pslld $7, %xmm7 - psrld $25, %xmm5 - pxor %xmm7, %xmm13 - pxor %xmm5, %xmm13 - movdqa %xmm12, %xmm7 + pxor %xmm5, %xmm1 paddd %xmm1, %xmm4 - paddd %xmm9, %xmm6 - paddd %xmm13, %xmm7 movdqa %xmm4, %xmm5 
pslld $9, %xmm4 psrld $23, %xmm5 pxor %xmm4, %xmm2 movdqa %xmm1, %xmm4 - pshufd $0x93, %xmm1, %xmm1 pxor %xmm5, %xmm2 - movdqa %xmm6, %xmm5 - pslld $9, %xmm6 - psrld $23, %xmm5 - pxor %xmm6, %xmm10 - movdqa %xmm9, %xmm6 - pshufd $0x93, %xmm9, %xmm9 - pxor %xmm5, %xmm10 - movdqa %xmm7, %xmm5 - pslld $9, %xmm7 - psrld $23, %xmm5 - pxor %xmm7, %xmm14 - movdqa %xmm13, %xmm7 - pshufd $0x93, %xmm13, %xmm13 - pxor %xmm5, %xmm14 + pshufd $0x93, %xmm1, %xmm1 paddd %xmm2, %xmm4 - paddd %xmm10, %xmm6 - paddd %xmm14, %xmm7 movdqa %xmm4, %xmm5 pslld $13, %xmm4 psrld $19, %xmm5 pxor %xmm4, %xmm3 movdqa %xmm2, %xmm4 - pshufd $0x4e, %xmm2, %xmm2 pxor %xmm5, %xmm3 - movdqa %xmm6, %xmm5 - pslld $13, %xmm6 - psrld $19, %xmm5 - pxor %xmm6, %xmm11 - movdqa %xmm10, %xmm6 - pshufd $0x4e, %xmm10, %xmm10 - pxor %xmm5, %xmm11 - movdqa %xmm7, %xmm5 - pslld $13, %xmm7 - psrld $19, %xmm5 - pxor %xmm7, %xmm15 - movdqa %xmm14, %xmm7 - pshufd $0x4e, %xmm14, %xmm14 - pxor %xmm5, %xmm15 + pshufd $0x4e, %xmm2, %xmm2 paddd %xmm3, %xmm4 - paddd %xmm11, %xmm6 - paddd %xmm15, %xmm7 movdqa %xmm4, %xmm5 pslld $18, %xmm4 psrld $14, %xmm5 pxor %xmm4, %xmm0 pshufd $0x39, %xmm3, %xmm3 pxor %xmm5, %xmm0 - movdqa %xmm6, %xmm5 - pslld $18, %xmm6 - psrld $14, %xmm5 - pxor %xmm6, %xmm8 - pshufd $0x39, %xmm11, %xmm11 - pxor %xmm5, %xmm8 - movdqa %xmm7, %xmm5 - pslld $18, %xmm7 + paddd %xmm0, %xmm8 + paddd %xmm1, %xmm9 + paddd %xmm2, %xmm10 + paddd %xmm3, %xmm11 + + pxor %xmm8, %xmm12 + pxor %xmm9, %xmm13 + pxor %xmm10, %xmm14 + pxor %xmm11, %xmm15 + movdqa %xmm12, %xmm0 + movdqa %xmm13, %xmm1 + movdqa %xmm14, %xmm2 + movdqa %xmm15, %xmm3 + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 psrld $14, %xmm5 - pxor %xmm7, %xmm12 - pshufd $0x39, %xmm15, %xmm15 - pxor %xmm5, %xmm12 -.endm - -.macro salsa8_core_3way_xmm - salsa8_core_3way_xmm_doubleround - salsa8_core_3way_xmm_doubleround - salsa8_core_3way_xmm_doubleround - salsa8_core_3way_xmm_doubleround -.endm + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 - .p2align 6 -scrypt_core_3way_xmm: - scrypt_shuffle %rdi, 0, %rsp, 0 - scrypt_shuffle %rdi, 64, %rsp, 64 - scrypt_shuffle %rdi, 128, %rsp, 128 - scrypt_shuffle %rdi, 192, %rsp, 192 - scrypt_shuffle %rdi, 256, %rsp, 256 - scrypt_shuffle %rdi, 320, %rsp, 320 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 - movdqa 64(%rsp), %xmm0 - movdqa 80(%rsp), %xmm1 - movdqa 96(%rsp), %xmm2 - movdqa 112(%rsp), %xmm3 - movdqa 128+64(%rsp), %xmm8 - movdqa 128+80(%rsp), %xmm9 - movdqa 128+96(%rsp), %xmm10 - movdqa 128+112(%rsp), %xmm11 - movdqa 256+64(%rsp), %xmm12 - movdqa 256+80(%rsp), %xmm13 - movdqa 256+96(%rsp), %xmm14 - movdqa 256+112(%rsp), %xmm15 + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 - movq %rsi, %rbx - leaq (%r8, %r8, 2), %rax - shlq $7, %rax - addq %rsi, %rax 
-scrypt_core_3way_xmm_loop1: - movdqa %xmm0, 64(%rbx) - movdqa %xmm1, 80(%rbx) - movdqa %xmm2, 96(%rbx) - movdqa %xmm3, 112(%rbx) - pxor 0(%rsp), %xmm0 - pxor 16(%rsp), %xmm1 - pxor 32(%rsp), %xmm2 - pxor 48(%rsp), %xmm3 - movdqa %xmm8, 128+64(%rbx) - movdqa %xmm9, 128+80(%rbx) - movdqa %xmm10, 128+96(%rbx) - movdqa %xmm11, 128+112(%rbx) - pxor 128+0(%rsp), %xmm8 - pxor 128+16(%rsp), %xmm9 - pxor 128+32(%rsp), %xmm10 - pxor 128+48(%rsp), %xmm11 - movdqa %xmm12, 256+64(%rbx) - movdqa %xmm13, 256+80(%rbx) - movdqa %xmm14, 256+96(%rbx) - movdqa %xmm15, 256+112(%rbx) - pxor 256+0(%rsp), %xmm12 - pxor 256+16(%rsp), %xmm13 - pxor 256+32(%rsp), %xmm14 - pxor 256+48(%rsp), %xmm15 - movdqa %xmm0, 0(%rbx) - movdqa %xmm1, 16(%rbx) - movdqa %xmm2, 32(%rbx) - movdqa %xmm3, 48(%rbx) - movdqa %xmm8, 128+0(%rbx) - movdqa %xmm9, 128+16(%rbx) - movdqa %xmm10, 128+32(%rbx) - movdqa %xmm11, 128+48(%rbx) - movdqa %xmm12, 256+0(%rbx) - movdqa %xmm13, 256+16(%rbx) - movdqa %xmm14, 256+32(%rbx) - movdqa %xmm15, 256+48(%rbx) + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 - salsa8_core_3way_xmm - paddd 0(%rbx), %xmm0 - paddd 16(%rbx), %xmm1 - paddd 32(%rbx), %xmm2 - paddd 48(%rbx), %xmm3 - paddd 128+0(%rbx), %xmm8 - paddd 128+16(%rbx), %xmm9 - paddd 128+32(%rbx), %xmm10 - paddd 128+48(%rbx), %xmm11 - paddd 256+0(%rbx), %xmm12 - paddd 256+16(%rbx), %xmm13 - paddd 256+32(%rbx), %xmm14 - paddd 256+48(%rbx), %xmm15 - movdqa %xmm0, 0(%rsp) - movdqa %xmm1, 16(%rsp) - movdqa %xmm2, 32(%rsp) - movdqa %xmm3, 48(%rsp) - movdqa %xmm8, 128+0(%rsp) - movdqa %xmm9, 128+16(%rsp) - movdqa %xmm10, 128+32(%rsp) - movdqa %xmm11, 128+48(%rsp) - movdqa %xmm12, 256+0(%rsp) - movdqa %xmm13, 256+16(%rsp) - movdqa %xmm14, 256+32(%rsp) - movdqa %xmm15, 256+48(%rsp) + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 - pxor 64(%rbx), %xmm0 - pxor 80(%rbx), %xmm1 - pxor 96(%rbx), %xmm2 - pxor 112(%rbx), %xmm3 - pxor 128+64(%rbx), %xmm8 - pxor 128+80(%rbx), %xmm9 - pxor 128+96(%rbx), %xmm10 - pxor 128+112(%rbx), %xmm11 - pxor 256+64(%rbx), %xmm12 - pxor 256+80(%rbx), %xmm13 - pxor 256+96(%rbx), %xmm14 - pxor 256+112(%rbx), %xmm15 - movdqa %xmm0, 64(%rsp) - movdqa %xmm1, 80(%rsp) - movdqa %xmm2, 96(%rsp) - movdqa %xmm3, 112(%rsp) - movdqa %xmm8, 128+64(%rsp) - movdqa %xmm9, 128+80(%rsp) - movdqa %xmm10, 128+96(%rsp) - movdqa %xmm11, 128+112(%rsp) - movdqa %xmm12, 256+64(%rsp) - movdqa %xmm13, 256+80(%rsp) - movdqa %xmm14, 256+96(%rsp) - movdqa %xmm15, 256+112(%rsp) - salsa8_core_3way_xmm - paddd 64(%rsp), %xmm0 - paddd 80(%rsp), %xmm1 - paddd 96(%rsp), %xmm2 - paddd 112(%rsp), %xmm3 - paddd 128+64(%rsp), %xmm8 - paddd 128+80(%rsp), %xmm9 - paddd 128+96(%rsp), %xmm10 - paddd 128+112(%rsp), %xmm11 - paddd 256+64(%rsp), %xmm12 - paddd 256+80(%rsp), %xmm13 - paddd 256+96(%rsp), %xmm14 - paddd 256+112(%rsp), %xmm15 + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 - addq $3*128, %rbx - cmpq %rax, %rbx - jne scrypt_core_3way_xmm_loop1 + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, 
%xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 - movdqa %xmm0, 64(%rsp) - movdqa %xmm1, 80(%rsp) - movdqa %xmm2, 96(%rsp) - movdqa %xmm3, 112(%rsp) - movdqa %xmm8, 128+64(%rsp) - movdqa %xmm9, 128+80(%rsp) - movdqa %xmm10, 128+96(%rsp) - movdqa %xmm11, 128+112(%rsp) - movdqa %xmm12, 256+64(%rsp) - movdqa %xmm13, 256+80(%rsp) - movdqa %xmm14, 256+96(%rsp) - movdqa %xmm15, 256+112(%rsp) + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 - movq %r8, %rcx - subq $1, %r8 -scrypt_core_3way_xmm_loop2: - movd %xmm0, %ebp + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + 
movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + paddd %xmm0, %xmm12 + paddd %xmm1, %xmm13 + paddd %xmm2, %xmm14 + paddd %xmm3, %xmm15 + + addq $128, %rdx + cmpq %rcx, %rdx + jne scrypt_core_xmm_loop1 + + movq %r8, %rcx + subl $1, %r8d +scrypt_core_xmm_loop2: + movd %xmm12, %edx + andl %r8d, %edx + shll $7, %edx + pxor 0(%rsi, %rdx), %xmm8 + pxor 16(%rsi, %rdx), %xmm9 + pxor 32(%rsi, %rdx), %xmm10 + pxor 48(%rsi, %rdx), %xmm11 + + pxor %xmm12, %xmm8 + pxor %xmm13, %xmm9 + pxor %xmm14, %xmm10 + pxor %xmm15, %xmm11 + movdqa %xmm8, %xmm0 + movdqa %xmm9, %xmm1 + movdqa %xmm10, %xmm2 + movdqa %xmm11, %xmm3 + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor 
%xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + paddd %xmm0, %xmm8 + paddd %xmm1, %xmm9 + paddd %xmm2, %xmm10 + paddd %xmm3, %xmm11 + + pxor 64(%rsi, %rdx), %xmm12 + pxor 80(%rsi, %rdx), %xmm13 + pxor 96(%rsi, %rdx), %xmm14 + pxor 112(%rsi, %rdx), %xmm15 + pxor %xmm8, %xmm12 + pxor %xmm9, %xmm13 + pxor %xmm10, %xmm14 + pxor %xmm11, %xmm15 + movdqa %xmm12, %xmm0 + movdqa %xmm13, %xmm1 + movdqa %xmm14, %xmm2 + movdqa %xmm15, %xmm3 + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa 
%xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm1, 
%xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + paddd %xmm0, %xmm12 + paddd %xmm1, %xmm13 + paddd %xmm2, %xmm14 + paddd %xmm3, %xmm15 + + subq $1, %rcx + ja scrypt_core_xmm_loop2 + + pcmpeqw %xmm1, %xmm1 + psrlq $32, %xmm1 + + movdqa %xmm8, %xmm0 + pxor %xmm9, %xmm8 + pand %xmm1, %xmm8 + pxor %xmm9, %xmm8 + pxor %xmm10, %xmm9 + pand %xmm1, %xmm9 + pxor %xmm10, %xmm9 + pxor %xmm11, %xmm10 + pand %xmm1, %xmm10 + pxor %xmm11, %xmm10 + pxor %xmm0, %xmm11 + pand %xmm1, %xmm11 + pxor %xmm0, %xmm11 + movdqa %xmm8, %xmm0 + pshufd $0x4e, %xmm10, %xmm10 + punpcklqdq %xmm10, %xmm8 + punpckhqdq %xmm0, %xmm10 + movdqa %xmm9, %xmm0 + pshufd $0x4e, %xmm11, %xmm11 + punpcklqdq %xmm11, %xmm9 + punpckhqdq %xmm0, %xmm11 + movdqa %xmm8, 0(%rdi) + movdqa %xmm11, 16(%rdi) + movdqa %xmm10, 32(%rdi) + movdqa %xmm9, 48(%rdi) + + movdqa %xmm12, %xmm0 + pxor %xmm13, %xmm12 + pand %xmm1, %xmm12 + pxor %xmm13, %xmm12 + pxor %xmm14, %xmm13 + pand %xmm1, %xmm13 + pxor %xmm14, %xmm13 + pxor %xmm15, %xmm14 + pand %xmm1, %xmm14 + pxor %xmm15, %xmm14 + pxor %xmm0, %xmm15 + pand %xmm1, %xmm15 + pxor %xmm0, %xmm15 + movdqa %xmm12, %xmm0 + pshufd $0x4e, %xmm14, %xmm14 + punpcklqdq %xmm14, %xmm12 + punpckhqdq %xmm0, %xmm14 + movdqa %xmm13, %xmm0 + pshufd $0x4e, %xmm15, %xmm15 + punpcklqdq %xmm15, %xmm13 + punpckhqdq %xmm0, %xmm15 + movdqa %xmm12, 64(%rdi) + movdqa %xmm15, 80(%rdi) + movdqa %xmm14, 96(%rdi) + movdqa %xmm13, 112(%rdi) + +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + popq %rdi + movdqa 8(%rsp), %xmm6 + movdqa 24(%rsp), %xmm7 + movdqa 40(%rsp), %xmm8 + movdqa 56(%rsp), %xmm9 + movdqa 72(%rsp), %xmm10 + movdqa 88(%rsp), %xmm11 + movdqa 104(%rsp), %xmm12 + movdqa 120(%rsp), %xmm13 + movdqa 136(%rsp), %xmm14 + movdqa 152(%rsp), %xmm15 + addq $176, %rsp +#endif + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + + +#if defined(USE_AVX) + +#endif /* USE_AVX */ + + .text + .p2align 6 + .globl scrypt_core_3way + .globl _scrypt_core_3way +scrypt_core_3way: +_scrypt_core_3way: + pushq %rbx + pushq %rbp +#if defined(_WIN64) || defined(__CYGWIN__) + subq $176, %rsp + movdqa %xmm6, 8(%rsp) + movdqa %xmm7, 24(%rsp) + movdqa %xmm8, 40(%rsp) + movdqa %xmm9, 56(%rsp) + movdqa %xmm10, 72(%rsp) + movdqa %xmm11, 88(%rsp) + movdqa %xmm12, 104(%rsp) + 
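The scrypt_core_3way prologue beginning above (and continuing just below) handles calling-convention differences: on Win64/Cygwin the first three integer arguments arrive in rcx/rdx/r8 and xmm6-xmm15 plus rdi/rsi are callee-saved, so those registers are spilled to the 176-byte stack area and rcx/rdx are copied into rdi/rsi; on the SysV ABI the arguments already sit in rdi/rsi/rdx and only N needs to be moved into r8. The C-level interface is presumably along these lines (prototype shown as an assumption, not quoted from miner.h):

    #include <stdint.h>

    /* X: three 128-byte scrypt blocks, V: 3 * N * 128-byte scratchpad,
       N: the scrypt cost parameter (a power of two) */
    void scrypt_core_3way(uint32_t *X, uint32_t *V, int N);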
movdqa %xmm13, 120(%rsp) + movdqa %xmm14, 136(%rsp) + movdqa %xmm15, 152(%rsp) + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#else + movq %rdx, %r8 +#endif + subq $392, %rsp + + +#if !defined(USE_AVX) + jmp scrypt_core_3way_xmm +#else + /* Check for AVX and OSXSAVE support */ + movl $1, %eax + cpuid + andl $0x18000000, %ecx + cmpl $0x18000000, %ecx + jne scrypt_core_3way_xmm + /* Check for XMM and YMM state support */ + xorl %ecx, %ecx + xgetbv + andl $0x00000006, %eax + cmpl $0x00000006, %eax + jne scrypt_core_3way_xmm +#if defined(USE_XOP) + /* Check for XOP support */ + movl $0x80000001, %eax + cpuid + andl $0x00000800, %ecx + jnz scrypt_core_3way_xop +#endif + +scrypt_core_3way_avx: + movl 0+60(%rdi), %eax + movl 0+44(%rdi), %ebx + movl 0+28(%rdi), %ecx + movl 0+12(%rdi), %edx + movl %eax, 0+12(%rsp) + movl %ebx, 0+28(%rsp) + movl %ecx, 0+44(%rsp) + movl %edx, 0+60(%rsp) + movl 0+40(%rdi), %eax + movl 0+8(%rdi), %ebx + movl 0+48(%rdi), %ecx + movl 0+16(%rdi), %edx + movl %eax, 0+8(%rsp) + movl %ebx, 0+40(%rsp) + movl %ecx, 0+16(%rsp) + movl %edx, 0+48(%rsp) + movl 0+20(%rdi), %eax + movl 0+4(%rdi), %ebx + movl 0+52(%rdi), %ecx + movl 0+36(%rdi), %edx + movl %eax, 0+4(%rsp) + movl %ebx, 0+20(%rsp) + movl %ecx, 0+36(%rsp) + movl %edx, 0+52(%rsp) + movl 0+0(%rdi), %eax + movl 0+24(%rdi), %ebx + movl 0+32(%rdi), %ecx + movl 0+56(%rdi), %edx + movl %eax, 0+0(%rsp) + movl %ebx, 0+24(%rsp) + movl %ecx, 0+32(%rsp) + movl %edx, 0+56(%rsp) + movl 64+60(%rdi), %eax + movl 64+44(%rdi), %ebx + movl 64+28(%rdi), %ecx + movl 64+12(%rdi), %edx + movl %eax, 64+12(%rsp) + movl %ebx, 64+28(%rsp) + movl %ecx, 64+44(%rsp) + movl %edx, 64+60(%rsp) + movl 64+40(%rdi), %eax + movl 64+8(%rdi), %ebx + movl 64+48(%rdi), %ecx + movl 64+16(%rdi), %edx + movl %eax, 64+8(%rsp) + movl %ebx, 64+40(%rsp) + movl %ecx, 64+16(%rsp) + movl %edx, 64+48(%rsp) + movl 64+20(%rdi), %eax + movl 64+4(%rdi), %ebx + movl 64+52(%rdi), %ecx + movl 64+36(%rdi), %edx + movl %eax, 64+4(%rsp) + movl %ebx, 64+20(%rsp) + movl %ecx, 64+36(%rsp) + movl %edx, 64+52(%rsp) + movl 64+0(%rdi), %eax + movl 64+24(%rdi), %ebx + movl 64+32(%rdi), %ecx + movl 64+56(%rdi), %edx + movl %eax, 64+0(%rsp) + movl %ebx, 64+24(%rsp) + movl %ecx, 64+32(%rsp) + movl %edx, 64+56(%rsp) + movl 128+60(%rdi), %eax + movl 128+44(%rdi), %ebx + movl 128+28(%rdi), %ecx + movl 128+12(%rdi), %edx + movl %eax, 128+12(%rsp) + movl %ebx, 128+28(%rsp) + movl %ecx, 128+44(%rsp) + movl %edx, 128+60(%rsp) + movl 128+40(%rdi), %eax + movl 128+8(%rdi), %ebx + movl 128+48(%rdi), %ecx + movl 128+16(%rdi), %edx + movl %eax, 128+8(%rsp) + movl %ebx, 128+40(%rsp) + movl %ecx, 128+16(%rsp) + movl %edx, 128+48(%rsp) + movl 128+20(%rdi), %eax + movl 128+4(%rdi), %ebx + movl 128+52(%rdi), %ecx + movl 128+36(%rdi), %edx + movl %eax, 128+4(%rsp) + movl %ebx, 128+20(%rsp) + movl %ecx, 128+36(%rsp) + movl %edx, 128+52(%rsp) + movl 128+0(%rdi), %eax + movl 128+24(%rdi), %ebx + movl 128+32(%rdi), %ecx + movl 128+56(%rdi), %edx + movl %eax, 128+0(%rsp) + movl %ebx, 128+24(%rsp) + movl %ecx, 128+32(%rsp) + movl %edx, 128+56(%rsp) + movl 192+60(%rdi), %eax + movl 192+44(%rdi), %ebx + movl 192+28(%rdi), %ecx + movl 192+12(%rdi), %edx + movl %eax, 192+12(%rsp) + movl %ebx, 192+28(%rsp) + movl %ecx, 192+44(%rsp) + movl %edx, 192+60(%rsp) + movl 192+40(%rdi), %eax + movl 192+8(%rdi), %ebx + movl 192+48(%rdi), %ecx + movl 192+16(%rdi), %edx + movl %eax, 192+8(%rsp) + movl %ebx, 192+40(%rsp) + movl %ecx, 192+16(%rsp) + movl %edx, 192+48(%rsp) + movl 192+20(%rdi), %eax + movl 192+4(%rdi), 
%ebx + movl 192+52(%rdi), %ecx + movl 192+36(%rdi), %edx + movl %eax, 192+4(%rsp) + movl %ebx, 192+20(%rsp) + movl %ecx, 192+36(%rsp) + movl %edx, 192+52(%rsp) + movl 192+0(%rdi), %eax + movl 192+24(%rdi), %ebx + movl 192+32(%rdi), %ecx + movl 192+56(%rdi), %edx + movl %eax, 192+0(%rsp) + movl %ebx, 192+24(%rsp) + movl %ecx, 192+32(%rsp) + movl %edx, 192+56(%rsp) + movl 256+60(%rdi), %eax + movl 256+44(%rdi), %ebx + movl 256+28(%rdi), %ecx + movl 256+12(%rdi), %edx + movl %eax, 256+12(%rsp) + movl %ebx, 256+28(%rsp) + movl %ecx, 256+44(%rsp) + movl %edx, 256+60(%rsp) + movl 256+40(%rdi), %eax + movl 256+8(%rdi), %ebx + movl 256+48(%rdi), %ecx + movl 256+16(%rdi), %edx + movl %eax, 256+8(%rsp) + movl %ebx, 256+40(%rsp) + movl %ecx, 256+16(%rsp) + movl %edx, 256+48(%rsp) + movl 256+20(%rdi), %eax + movl 256+4(%rdi), %ebx + movl 256+52(%rdi), %ecx + movl 256+36(%rdi), %edx + movl %eax, 256+4(%rsp) + movl %ebx, 256+20(%rsp) + movl %ecx, 256+36(%rsp) + movl %edx, 256+52(%rsp) + movl 256+0(%rdi), %eax + movl 256+24(%rdi), %ebx + movl 256+32(%rdi), %ecx + movl 256+56(%rdi), %edx + movl %eax, 256+0(%rsp) + movl %ebx, 256+24(%rsp) + movl %ecx, 256+32(%rsp) + movl %edx, 256+56(%rsp) + movl 320+60(%rdi), %eax + movl 320+44(%rdi), %ebx + movl 320+28(%rdi), %ecx + movl 320+12(%rdi), %edx + movl %eax, 320+12(%rsp) + movl %ebx, 320+28(%rsp) + movl %ecx, 320+44(%rsp) + movl %edx, 320+60(%rsp) + movl 320+40(%rdi), %eax + movl 320+8(%rdi), %ebx + movl 320+48(%rdi), %ecx + movl 320+16(%rdi), %edx + movl %eax, 320+8(%rsp) + movl %ebx, 320+40(%rsp) + movl %ecx, 320+16(%rsp) + movl %edx, 320+48(%rsp) + movl 320+20(%rdi), %eax + movl 320+4(%rdi), %ebx + movl 320+52(%rdi), %ecx + movl 320+36(%rdi), %edx + movl %eax, 320+4(%rsp) + movl %ebx, 320+20(%rsp) + movl %ecx, 320+36(%rsp) + movl %edx, 320+52(%rsp) + movl 320+0(%rdi), %eax + movl 320+24(%rdi), %ebx + movl 320+32(%rdi), %ecx + movl 320+56(%rdi), %edx + movl %eax, 320+0(%rsp) + movl %ebx, 320+24(%rsp) + movl %ecx, 320+32(%rsp) + movl %edx, 320+56(%rsp) + + movdqa 64(%rsp), %xmm0 + movdqa 80(%rsp), %xmm1 + movdqa 96(%rsp), %xmm2 + movdqa 112(%rsp), %xmm3 + movdqa 128+64(%rsp), %xmm8 + movdqa 128+80(%rsp), %xmm9 + movdqa 128+96(%rsp), %xmm10 + movdqa 128+112(%rsp), %xmm11 + movdqa 256+64(%rsp), %xmm12 + movdqa 256+80(%rsp), %xmm13 + movdqa 256+96(%rsp), %xmm14 + movdqa 256+112(%rsp), %xmm15 + + movq %rsi, %rbx + leaq (%r8, %r8, 2), %rax + shlq $7, %rax + addq %rsi, %rax +scrypt_core_3way_avx_loop1: + movdqa %xmm0, 64(%rbx) + movdqa %xmm1, 80(%rbx) + movdqa %xmm2, 96(%rbx) + movdqa %xmm3, 112(%rbx) + pxor 0(%rsp), %xmm0 + pxor 16(%rsp), %xmm1 + pxor 32(%rsp), %xmm2 + pxor 48(%rsp), %xmm3 + movdqa %xmm8, 128+64(%rbx) + movdqa %xmm9, 128+80(%rbx) + movdqa %xmm10, 128+96(%rbx) + movdqa %xmm11, 128+112(%rbx) + pxor 128+0(%rsp), %xmm8 + pxor 128+16(%rsp), %xmm9 + pxor 128+32(%rsp), %xmm10 + pxor 128+48(%rsp), %xmm11 + movdqa %xmm12, 256+64(%rbx) + movdqa %xmm13, 256+80(%rbx) + movdqa %xmm14, 256+96(%rbx) + movdqa %xmm15, 256+112(%rbx) + pxor 256+0(%rsp), %xmm12 + pxor 256+16(%rsp), %xmm13 + pxor 256+32(%rsp), %xmm14 + pxor 256+48(%rsp), %xmm15 + movdqa %xmm0, 0(%rbx) + movdqa %xmm1, 16(%rbx) + movdqa %xmm2, 32(%rbx) + movdqa %xmm3, 48(%rbx) + movdqa %xmm8, 128+0(%rbx) + movdqa %xmm9, 128+16(%rbx) + movdqa %xmm10, 128+32(%rbx) + movdqa %xmm11, 128+48(%rbx) + movdqa %xmm12, 256+0(%rbx) + movdqa %xmm13, 256+16(%rbx) + movdqa %xmm14, 256+32(%rbx) + movdqa %xmm15, 256+48(%rbx) + + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + 
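Two things happen just above before the AVX loop is entered. First, the runtime dispatch: CPUID leaf 1 must report both OSXSAVE (ECX bit 27) and AVX (ECX bit 28), XGETBV with ECX=0 must show XCR0 bits 1 and 2 set (the OS saves XMM and YMM state), and, when built with USE_XOP, CPUID leaf 0x80000001 ECX bit 11 additionally selects the XOP path; otherwise execution falls back to the plain XMM routine. Second, the scalar movl block permutes the 16 words of each 64-byte half into the register layout the SIMD rounds expect. A hedged C sketch of the same feature test (the helper name and GCC-style helpers are assumptions, not code from this patch):

    #include <stdint.h>
    #include <cpuid.h>              /* GCC/Clang __get_cpuid() */

    static int avx_usable(void)
    {
        unsigned int eax, ebx, ecx, edx;
        uint32_t xcr0_lo, xcr0_hi;

        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            return 0;
        if ((ecx & 0x18000000) != 0x18000000)   /* OSXSAVE + AVX */
            return 0;
        __asm__ volatile ("xgetbv" : "=a" (xcr0_lo), "=d" (xcr0_hi) : "c" (0));
        return (xcr0_lo & 6) == 6;              /* XMM and YMM state enabled */
    }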
vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpshufd $0x39, %xmm1, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpshufd $0x39, %xmm9, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, 
%xmm5 + vpsrld $14, %xmm6, %xmm6 + vpshufd $0x4e, %xmm14, %xmm14 + vpshufd $0x39, %xmm11, %xmm11 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm15, %xmm15 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpshufd $0x39, %xmm1, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpshufd $0x39, %xmm9, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, 
%xmm11, %xmm11 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpshufd $0x4e, %xmm14, %xmm14 + vpshufd $0x39, %xmm11, %xmm11 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm15, %xmm15 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpshufd $0x39, %xmm1, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpshufd $0x39, %xmm9, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor 
%xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpshufd $0x4e, %xmm14, %xmm14 + vpshufd $0x39, %xmm11, %xmm11 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm15, %xmm15 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpshufd $0x39, %xmm1, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpshufd $0x39, %xmm9, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, 
%xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpshufd $0x4e, %xmm14, %xmm14 + vpshufd $0x39, %xmm11, %xmm11 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm15, %xmm15 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + paddd 0(%rbx), %xmm0 + paddd 16(%rbx), %xmm1 + paddd 32(%rbx), %xmm2 + paddd 48(%rbx), %xmm3 + paddd 128+0(%rbx), %xmm8 + paddd 128+16(%rbx), %xmm9 + paddd 128+32(%rbx), %xmm10 + paddd 128+48(%rbx), %xmm11 + paddd 256+0(%rbx), %xmm12 + paddd 256+16(%rbx), %xmm13 + paddd 256+32(%rbx), %xmm14 + paddd 256+48(%rbx), %xmm15 + movdqa %xmm0, 0(%rsp) + movdqa %xmm1, 16(%rsp) + movdqa %xmm2, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm8, 128+0(%rsp) + movdqa %xmm9, 128+16(%rsp) + movdqa %xmm10, 128+32(%rsp) + movdqa %xmm11, 128+48(%rsp) + movdqa %xmm12, 256+0(%rsp) + movdqa %xmm13, 256+16(%rsp) + movdqa %xmm14, 256+32(%rsp) + movdqa %xmm15, 256+48(%rsp) + + pxor 64(%rbx), %xmm0 + pxor 80(%rbx), %xmm1 + pxor 96(%rbx), %xmm2 + pxor 112(%rbx), %xmm3 + pxor 128+64(%rbx), %xmm8 + pxor 128+80(%rbx), %xmm9 + pxor 128+96(%rbx), %xmm10 + pxor 128+112(%rbx), %xmm11 + pxor 256+64(%rbx), %xmm12 + pxor 256+80(%rbx), %xmm13 + pxor 256+96(%rbx), %xmm14 + pxor 256+112(%rbx), %xmm15 + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + 
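The vpaddd/vpslld/vpsrld/vpxor groups running through this hunk are the same Salsa20/8 core, executed for three independent lanes ({xmm0-3}, {xmm8-11}, {xmm12-15}) with non-destructive VEX three-operand forms and interleaved to hide instruction latency; the trailing paddd of the values saved to the scratchpad and stack is Salsa20's feed-forward. As a reference for what each 64-byte half goes through, here is a portable rendering of the mixing function (it mirrors the generic C path of this miner; treat this exact text as a sketch rather than a quotation):

    #include <stdint.h>

    #define R(a, b) (((a) << (b)) | ((a) >> (32 - (b))))

    static void xor_salsa8(uint32_t B[16], const uint32_t Bx[16])
    {
        uint32_t x[16];
        int i;

        for (i = 0; i < 16; i++)
            x[i] = (B[i] ^= Bx[i]);
        for (i = 0; i < 8; i += 2) {
            /* column round */
            x[ 4] ^= R(x[ 0]+x[12], 7);  x[ 8] ^= R(x[ 4]+x[ 0], 9);
            x[12] ^= R(x[ 8]+x[ 4],13);  x[ 0] ^= R(x[12]+x[ 8],18);
            x[ 9] ^= R(x[ 5]+x[ 1], 7);  x[13] ^= R(x[ 9]+x[ 5], 9);
            x[ 1] ^= R(x[13]+x[ 9],13);  x[ 5] ^= R(x[ 1]+x[13],18);
            x[14] ^= R(x[10]+x[ 6], 7);  x[ 2] ^= R(x[14]+x[10], 9);
            x[ 6] ^= R(x[ 2]+x[14],13);  x[10] ^= R(x[ 6]+x[ 2],18);
            x[ 3] ^= R(x[15]+x[11], 7);  x[ 7] ^= R(x[ 3]+x[15], 9);
            x[11] ^= R(x[ 7]+x[ 3],13);  x[15] ^= R(x[11]+x[ 7],18);
            /* row round */
            x[ 1] ^= R(x[ 0]+x[ 3], 7);  x[ 2] ^= R(x[ 1]+x[ 0], 9);
            x[ 3] ^= R(x[ 2]+x[ 1],13);  x[ 0] ^= R(x[ 3]+x[ 2],18);
            x[ 6] ^= R(x[ 5]+x[ 4], 7);  x[ 7] ^= R(x[ 6]+x[ 5], 9);
            x[ 4] ^= R(x[ 7]+x[ 6],13);  x[ 5] ^= R(x[ 4]+x[ 7],18);
            x[11] ^= R(x[10]+x[ 9], 7);  x[ 8] ^= R(x[11]+x[10], 9);
            x[ 9] ^= R(x[ 8]+x[11],13);  x[10] ^= R(x[ 9]+x[ 8],18);
            x[12] ^= R(x[15]+x[14], 7);  x[13] ^= R(x[12]+x[15], 9);
            x[14] ^= R(x[13]+x[12],13);  x[15] ^= R(x[14]+x[13],18);
        }
        for (i = 0; i < 16; i++)
            B[i] += x[i];
    }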
vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpshufd $0x39, %xmm1, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpshufd $0x39, %xmm9, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpshufd $0x4e, %xmm14, %xmm14 + vpshufd $0x39, %xmm11, %xmm11 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm15, %xmm15 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, 
%xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpshufd $0x39, %xmm1, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpshufd $0x39, %xmm9, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpshufd $0x4e, %xmm14, %xmm14 + vpshufd $0x39, %xmm11, %xmm11 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, 
%xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm15, %xmm15 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpshufd $0x39, %xmm1, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpshufd $0x39, %xmm9, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, 
%xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpshufd $0x4e, %xmm14, %xmm14 + vpshufd $0x39, %xmm11, %xmm11 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm15, %xmm15 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpshufd $0x39, %xmm1, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpshufd $0x39, %xmm9, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vpslld $13, %xmm4, %xmm5 + 
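Each loop iteration runs exactly two of these 8-round Salsa passes per lane: first on the lower 64-byte half XORed with the upper half, then on the upper half XORed with the freshly mixed lower half. That is scrypt's BlockMix for r = 1, sketched below in terms of the xor_salsa8 sketch above (the helper name is illustrative):

    #include <stdint.h>

    void xor_salsa8(uint32_t B[16], const uint32_t Bx[16]);   /* as sketched above */

    /* scrypt BlockMix for r = 1: the two 64-byte halves cross-mix */
    static void blockmix_salsa8_r1(uint32_t X[32])
    {
        xor_salsa8(&X[0],  &X[16]);   /* X0 = Salsa20/8(X0 ^ X1) */
        xor_salsa8(&X[16], &X[0]);    /* X1 = Salsa20/8(X1 ^ X0) */
    }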
vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpshufd $0x4e, %xmm14, %xmm14 + vpshufd $0x39, %xmm11, %xmm11 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm15, %xmm15 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + paddd 64(%rsp), %xmm0 + paddd 80(%rsp), %xmm1 + paddd 96(%rsp), %xmm2 + paddd 112(%rsp), %xmm3 + paddd 128+64(%rsp), %xmm8 + paddd 128+80(%rsp), %xmm9 + paddd 128+96(%rsp), %xmm10 + paddd 128+112(%rsp), %xmm11 + paddd 256+64(%rsp), %xmm12 + paddd 256+80(%rsp), %xmm13 + paddd 256+96(%rsp), %xmm14 + paddd 256+112(%rsp), %xmm15 + + addq $3*128, %rbx + cmpq %rax, %rbx + jne scrypt_core_3way_avx_loop1 + + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + + movq %r8, %rcx + subq $1, %r8 +scrypt_core_3way_avx_loop2: + movd %xmm0, %ebp + movd %xmm8, %ebx + movd %xmm12, %eax + pxor 0(%rsp), %xmm0 + pxor 16(%rsp), %xmm1 + pxor 32(%rsp), %xmm2 + pxor 48(%rsp), %xmm3 + pxor 128+0(%rsp), %xmm8 + pxor 128+16(%rsp), %xmm9 + pxor 128+32(%rsp), %xmm10 + pxor 128+48(%rsp), %xmm11 + pxor 256+0(%rsp), %xmm12 + pxor 256+16(%rsp), %xmm13 + pxor 256+32(%rsp), %xmm14 + pxor 256+48(%rsp), %xmm15 + andl %r8d, %ebp + leaq (%rbp, %rbp, 2), %rbp + shll $7, %ebp + andl %r8d, %ebx + leaq 1(%rbx, %rbx, 2), %rbx + shll $7, %ebx + andl %r8d, %eax + leaq 2(%rax, %rax, 2), %rax + shll $7, %eax + pxor 0(%rsi, %rbp), %xmm0 + pxor 16(%rsi, %rbp), %xmm1 + pxor 32(%rsi, %rbp), %xmm2 + pxor 48(%rsi, %rbp), %xmm3 + pxor 0(%rsi, %rbx), %xmm8 + pxor 16(%rsi, %rbx), %xmm9 + pxor 32(%rsi, %rbx), %xmm10 + pxor 48(%rsi, %rbx), %xmm11 + pxor 0(%rsi, %rax), %xmm12 + pxor 16(%rsi, %rax), %xmm13 + pxor 32(%rsi, %rax), %xmm14 + pxor 48(%rsi, %rax), %xmm15 + + movdqa %xmm0, 0(%rsp) + movdqa %xmm1, 16(%rsp) + movdqa %xmm2, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm8, 128+0(%rsp) + movdqa %xmm9, 128+16(%rsp) + movdqa %xmm10, 128+32(%rsp) + movdqa %xmm11, 128+48(%rsp) + movdqa %xmm12, 256+0(%rsp) + movdqa %xmm13, 256+16(%rsp) + movdqa %xmm14, 256+32(%rsp) + movdqa %xmm15, 256+48(%rsp) + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, 
%xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpshufd $0x39, %xmm1, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpshufd $0x39, %xmm9, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpshufd $0x4e, %xmm14, %xmm14 + vpshufd $0x39, %xmm11, %xmm11 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm15, %xmm15 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + 
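The scrypt_core_3way_avx_loop2 label a little above marks the second phase of scrypt: the scratchpad filled by loop1 is read back at data-dependent indices. r8 holds N-1 at this point, so andl %r8d reduces the first word of each lane's upper half modulo N, and the leaq (r,r,2) / shll $7 pair turns that index into a byte offset, because the three lanes are interleaved in V with lane L's block i stored at offset (3*i + L) * 128. A single-lane C reference for what each lane computes (helper names are illustrative, reusing the xor_salsa8 sketch above):

    #include <stdint.h>
    #include <string.h>

    void xor_salsa8(uint32_t B[16], const uint32_t Bx[16]);   /* as sketched above */

    static void scrypt_core_ref(uint32_t X[32], uint32_t *V, uint32_t N)
    {
        uint32_t i, j, k;

        for (i = 0; i < N; i++) {            /* loop1: fill the scratchpad */
            memcpy(&V[i * 32], X, 128);
            xor_salsa8(&X[0],  &X[16]);
            xor_salsa8(&X[16], &X[0]);
        }
        for (i = 0; i < N; i++) {            /* loop2: data-dependent reads */
            j = 32 * (X[16] & (N - 1));      /* Integerify: the andl %r8d above */
            for (k = 0; k < 32; k++)
                X[k] ^= V[j + k];
            xor_salsa8(&X[0],  &X[16]);
            xor_salsa8(&X[16], &X[0]);
        }
    }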
vpaddd %xmm12, %xmm13, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpshufd $0x39, %xmm1, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpshufd $0x39, %xmm9, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, 
%xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpshufd $0x4e, %xmm14, %xmm14 + vpshufd $0x39, %xmm11, %xmm11 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm15, %xmm15 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpshufd $0x39, %xmm1, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpshufd $0x39, %xmm9, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, 
%xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpshufd $0x4e, %xmm14, %xmm14 + vpshufd $0x39, %xmm11, %xmm11 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm15, %xmm15 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpshufd $0x39, %xmm1, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpshufd $0x39, %xmm9, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor 
%xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpshufd $0x4e, %xmm14, %xmm14 + vpshufd $0x39, %xmm11, %xmm11 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm15, %xmm15 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + paddd 0(%rsp), %xmm0 + paddd 16(%rsp), %xmm1 + paddd 32(%rsp), %xmm2 + paddd 48(%rsp), %xmm3 + paddd 128+0(%rsp), %xmm8 + paddd 128+16(%rsp), %xmm9 + paddd 128+32(%rsp), %xmm10 + paddd 128+48(%rsp), %xmm11 + paddd 256+0(%rsp), %xmm12 + paddd 256+16(%rsp), %xmm13 + paddd 256+32(%rsp), %xmm14 + paddd 256+48(%rsp), %xmm15 + movdqa %xmm0, 0(%rsp) + movdqa %xmm1, 16(%rsp) + movdqa %xmm2, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm8, 128+0(%rsp) + movdqa %xmm9, 128+16(%rsp) + movdqa %xmm10, 128+32(%rsp) + movdqa %xmm11, 128+48(%rsp) + movdqa %xmm12, 256+0(%rsp) + movdqa %xmm13, 256+16(%rsp) + movdqa %xmm14, 256+32(%rsp) + movdqa %xmm15, 256+48(%rsp) + + pxor 64(%rsi, %rbp), %xmm0 + pxor 80(%rsi, %rbp), %xmm1 + pxor 96(%rsi, %rbp), %xmm2 + pxor 112(%rsi, %rbp), %xmm3 + pxor 64(%rsi, %rbx), %xmm8 + pxor 80(%rsi, %rbx), %xmm9 + pxor 96(%rsi, %rbx), %xmm10 + pxor 112(%rsi, %rbx), %xmm11 + pxor 64(%rsi, %rax), %xmm12 + pxor 80(%rsi, %rax), %xmm13 + pxor 96(%rsi, %rax), %xmm14 + pxor 112(%rsi, %rax), %xmm15 + pxor 64(%rsp), %xmm0 + pxor 80(%rsp), %xmm1 + pxor 96(%rsp), %xmm2 + pxor 112(%rsp), %xmm3 + pxor 128+64(%rsp), %xmm8 + pxor 128+80(%rsp), %xmm9 + pxor 128+96(%rsp), %xmm10 + pxor 128+112(%rsp), %xmm11 + pxor 256+64(%rsp), %xmm12 + pxor 256+80(%rsp), %xmm13 + pxor 256+96(%rsp), %xmm14 + pxor 256+112(%rsp), %xmm15 + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor 
%xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpshufd $0x39, %xmm1, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpshufd $0x39, %xmm9, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpshufd $0x4e, %xmm14, %xmm14 + vpshufd $0x39, %xmm11, %xmm11 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm15, %xmm15 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpxor %xmm5, 
%xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpshufd $0x39, %xmm1, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpshufd $0x39, %xmm9, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpshufd $0x4e, %xmm14, %xmm14 + vpshufd $0x39, %xmm11, %xmm11 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 
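
Both AVX round blocks in this 3-way core are the scrypt Salsa20/8 kernel, run on four lanes at a time; because AVX1 has no packed-rotate instruction, every 32-bit rotate is spelled as a vpslld/vpsrld pair whose two results are folded into the target with vpxor. For reference, a scalar sketch of the same kernel (ROTL and xor_salsa8 are illustrative names here, not part of this patch; this mirrors the generic C scrypt path):

    #include <stdint.h>

    #define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b))))

    /* Salsa20/8 over B after keying it with Bx, the xor-salsa variant
     * scrypt uses.  Each ROTL below corresponds to one
     * vpslld/vpsrld/vpxor/vpxor group in the AVX code, or to a single
     * vprotd in the XOP code further down. */
    static void xor_salsa8(uint32_t B[16], const uint32_t Bx[16])
    {
        uint32_t x[16];
        int i;

        for (i = 0; i < 16; i++)
            x[i] = (B[i] ^= Bx[i]);
        for (i = 0; i < 8; i += 2) {
            /* column round */
            x[ 4] ^= ROTL(x[ 0] + x[12],  7);  x[ 8] ^= ROTL(x[ 4] + x[ 0],  9);
            x[12] ^= ROTL(x[ 8] + x[ 4], 13);  x[ 0] ^= ROTL(x[12] + x[ 8], 18);
            x[ 9] ^= ROTL(x[ 5] + x[ 1],  7);  x[13] ^= ROTL(x[ 9] + x[ 5],  9);
            x[ 1] ^= ROTL(x[13] + x[ 9], 13);  x[ 5] ^= ROTL(x[ 1] + x[13], 18);
            x[14] ^= ROTL(x[10] + x[ 6],  7);  x[ 2] ^= ROTL(x[14] + x[10],  9);
            x[ 6] ^= ROTL(x[ 2] + x[14], 13);  x[10] ^= ROTL(x[ 6] + x[ 2], 18);
            x[ 3] ^= ROTL(x[15] + x[11],  7);  x[ 7] ^= ROTL(x[ 3] + x[15],  9);
            x[11] ^= ROTL(x[ 7] + x[ 3], 13);  x[15] ^= ROTL(x[11] + x[ 7], 18);
            /* row round */
            x[ 1] ^= ROTL(x[ 0] + x[ 3],  7);  x[ 2] ^= ROTL(x[ 1] + x[ 0],  9);
            x[ 3] ^= ROTL(x[ 2] + x[ 1], 13);  x[ 0] ^= ROTL(x[ 3] + x[ 2], 18);
            x[ 6] ^= ROTL(x[ 5] + x[ 4],  7);  x[ 7] ^= ROTL(x[ 6] + x[ 5],  9);
            x[ 4] ^= ROTL(x[ 7] + x[ 6], 13);  x[ 5] ^= ROTL(x[ 4] + x[ 7], 18);
            x[11] ^= ROTL(x[10] + x[ 9],  7);  x[ 8] ^= ROTL(x[11] + x[10],  9);
            x[ 9] ^= ROTL(x[ 8] + x[11], 13);  x[10] ^= ROTL(x[ 9] + x[ 8], 18);
            x[12] ^= ROTL(x[15] + x[14],  7);  x[13] ^= ROTL(x[12] + x[15],  9);
            x[14] ^= ROTL(x[13] + x[12], 13);  x[15] ^= ROTL(x[14] + x[13], 18);
        }
        for (i = 0; i < 16; i++)
            B[i] += x[i];
    }

The vector code keeps one Salsa20 diagonal per xmm register (see the word shuffle noted below), so the four operations of each column or row step run in parallel, and the vpshufd $0x93 / $0x4e / $0x39 shuffles realign the diagonals between steps.
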
+ vpsrld $14, %xmm7, %xmm7 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm15, %xmm15 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpshufd $0x39, %xmm1, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpshufd $0x39, %xmm9, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, 
%xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpshufd $0x4e, %xmm14, %xmm14 + vpshufd $0x39, %xmm11, %xmm11 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm15, %xmm15 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpshufd $0x39, %xmm1, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpshufd $0x39, %xmm9, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + 
vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpshufd $0x4e, %xmm14, %xmm14 + vpshufd $0x39, %xmm11, %xmm11 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm15, %xmm15 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + paddd 64(%rsp), %xmm0 + paddd 80(%rsp), %xmm1 + paddd 96(%rsp), %xmm2 + paddd 112(%rsp), %xmm3 + paddd 128+64(%rsp), %xmm8 + paddd 128+80(%rsp), %xmm9 + paddd 128+96(%rsp), %xmm10 + paddd 128+112(%rsp), %xmm11 + paddd 256+64(%rsp), %xmm12 + paddd 256+80(%rsp), %xmm13 + paddd 256+96(%rsp), %xmm14 + paddd 256+112(%rsp), %xmm15 + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + + subq $1, %rcx + ja scrypt_core_3way_avx_loop2 + + movl 0+60(%rsp), %eax + movl 0+44(%rsp), %ebx + movl 0+28(%rsp), %ecx + movl 0+12(%rsp), %edx + movl %eax, 0+12(%rdi) + movl %ebx, 0+28(%rdi) + movl %ecx, 0+44(%rdi) + movl %edx, 0+60(%rdi) + movl 0+40(%rsp), %eax + movl 0+8(%rsp), %ebx + movl 0+48(%rsp), %ecx + movl 0+16(%rsp), %edx + movl %eax, 0+8(%rdi) + movl %ebx, 0+40(%rdi) + movl %ecx, 0+16(%rdi) + movl %edx, 0+48(%rdi) + movl 0+20(%rsp), %eax + movl 0+4(%rsp), %ebx + movl 0+52(%rsp), %ecx + movl 0+36(%rsp), %edx + movl %eax, 0+4(%rdi) + movl %ebx, 0+20(%rdi) + movl %ecx, 0+36(%rdi) + movl %edx, 0+52(%rdi) + movl 0+0(%rsp), %eax + movl 0+24(%rsp), %ebx + movl 0+32(%rsp), %ecx + movl 0+56(%rsp), %edx + movl %eax, 0+0(%rdi) + movl %ebx, 0+24(%rdi) + movl %ecx, 0+32(%rdi) + movl %edx, 0+56(%rdi) + movl 64+60(%rsp), %eax + movl 64+44(%rsp), %ebx + movl 64+28(%rsp), %ecx + movl 64+12(%rsp), %edx + movl %eax, 64+12(%rdi) + movl %ebx, 64+28(%rdi) + movl %ecx, 64+44(%rdi) + movl %edx, 64+60(%rdi) + movl 64+40(%rsp), %eax + movl 64+8(%rsp), %ebx + movl 64+48(%rsp), %ecx + movl 64+16(%rsp), %edx + movl %eax, 64+8(%rdi) + movl %ebx, 64+40(%rdi) + movl %ecx, 64+16(%rdi) + movl %edx, 64+48(%rdi) + movl 64+20(%rsp), %eax + movl 64+4(%rsp), %ebx + movl 64+52(%rsp), %ecx + movl 64+36(%rsp), %edx + movl %eax, 64+4(%rdi) + movl %ebx, 64+20(%rdi) + movl %ecx, 64+36(%rdi) + movl %edx, 64+52(%rdi) + movl 64+0(%rsp), %eax + movl 64+24(%rsp), %ebx + movl 64+32(%rsp), %ecx + movl 64+56(%rsp), %edx + movl %eax, 64+0(%rdi) + movl %ebx, 64+24(%rdi) + movl %ecx, 64+32(%rdi) + movl %edx, 64+56(%rdi) + movl 128+60(%rsp), %eax + movl 128+44(%rsp), %ebx + movl 128+28(%rsp), %ecx + movl 128+12(%rsp), %edx + movl %eax, 128+12(%rdi) + movl %ebx, 128+28(%rdi) + movl %ecx, 128+44(%rdi) + movl %edx, 128+60(%rdi) + movl 128+40(%rsp), %eax + movl 128+8(%rsp), %ebx + movl 128+48(%rsp), %ecx + 
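
scrypt_core_3way_avx_loop2 above is the second ROMix pass. Per lane it picks a pseudorandom scratchpad entry with j = X[16] & (N - 1) (the movd/andl at the top of the loop, with %r8 holding N - 1), XORs that 128-byte entry into the running block, and applies the two Salsa20/8 halves. Because the three lanes' scratchpad entries are interleaved in 384-byte cells, the leaq/shll pairs turn j into the byte offset (3*j + lane) * 128. A scalar sketch of one lane, using the xor_salsa8 sketched above (romix_mix, V and N are illustrative names, not from this patch):

    #include <stdint.h>

    /* Second ROMix pass for one lane: X is the 32-word (128-byte)
     * working block, V the N-entry scratchpad filled by the first pass. */
    static void romix_mix(uint32_t X[32], const uint32_t *V, uint32_t N)
    {
        uint32_t i, j;
        int k;

        for (i = 0; i < N; i++) {
            j = X[16] & (N - 1);           /* Integerify(X) mod N           */
            for (k = 0; k < 32; k++)
                X[k] ^= V[32 * j + k];     /* the pxor loads from (%rsi, j) */
            xor_salsa8(&X[0], &X[16]);     /* first 64-byte half            */
            xor_salsa8(&X[16], &X[0]);     /* second 64-byte half           */
        }
    }

The subq $1, %rcx / ja pair runs this body N times, matching the plain C core.
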
movl 128+16(%rsp), %edx + movl %eax, 128+8(%rdi) + movl %ebx, 128+40(%rdi) + movl %ecx, 128+16(%rdi) + movl %edx, 128+48(%rdi) + movl 128+20(%rsp), %eax + movl 128+4(%rsp), %ebx + movl 128+52(%rsp), %ecx + movl 128+36(%rsp), %edx + movl %eax, 128+4(%rdi) + movl %ebx, 128+20(%rdi) + movl %ecx, 128+36(%rdi) + movl %edx, 128+52(%rdi) + movl 128+0(%rsp), %eax + movl 128+24(%rsp), %ebx + movl 128+32(%rsp), %ecx + movl 128+56(%rsp), %edx + movl %eax, 128+0(%rdi) + movl %ebx, 128+24(%rdi) + movl %ecx, 128+32(%rdi) + movl %edx, 128+56(%rdi) + movl 192+60(%rsp), %eax + movl 192+44(%rsp), %ebx + movl 192+28(%rsp), %ecx + movl 192+12(%rsp), %edx + movl %eax, 192+12(%rdi) + movl %ebx, 192+28(%rdi) + movl %ecx, 192+44(%rdi) + movl %edx, 192+60(%rdi) + movl 192+40(%rsp), %eax + movl 192+8(%rsp), %ebx + movl 192+48(%rsp), %ecx + movl 192+16(%rsp), %edx + movl %eax, 192+8(%rdi) + movl %ebx, 192+40(%rdi) + movl %ecx, 192+16(%rdi) + movl %edx, 192+48(%rdi) + movl 192+20(%rsp), %eax + movl 192+4(%rsp), %ebx + movl 192+52(%rsp), %ecx + movl 192+36(%rsp), %edx + movl %eax, 192+4(%rdi) + movl %ebx, 192+20(%rdi) + movl %ecx, 192+36(%rdi) + movl %edx, 192+52(%rdi) + movl 192+0(%rsp), %eax + movl 192+24(%rsp), %ebx + movl 192+32(%rsp), %ecx + movl 192+56(%rsp), %edx + movl %eax, 192+0(%rdi) + movl %ebx, 192+24(%rdi) + movl %ecx, 192+32(%rdi) + movl %edx, 192+56(%rdi) + movl 256+60(%rsp), %eax + movl 256+44(%rsp), %ebx + movl 256+28(%rsp), %ecx + movl 256+12(%rsp), %edx + movl %eax, 256+12(%rdi) + movl %ebx, 256+28(%rdi) + movl %ecx, 256+44(%rdi) + movl %edx, 256+60(%rdi) + movl 256+40(%rsp), %eax + movl 256+8(%rsp), %ebx + movl 256+48(%rsp), %ecx + movl 256+16(%rsp), %edx + movl %eax, 256+8(%rdi) + movl %ebx, 256+40(%rdi) + movl %ecx, 256+16(%rdi) + movl %edx, 256+48(%rdi) + movl 256+20(%rsp), %eax + movl 256+4(%rsp), %ebx + movl 256+52(%rsp), %ecx + movl 256+36(%rsp), %edx + movl %eax, 256+4(%rdi) + movl %ebx, 256+20(%rdi) + movl %ecx, 256+36(%rdi) + movl %edx, 256+52(%rdi) + movl 256+0(%rsp), %eax + movl 256+24(%rsp), %ebx + movl 256+32(%rsp), %ecx + movl 256+56(%rsp), %edx + movl %eax, 256+0(%rdi) + movl %ebx, 256+24(%rdi) + movl %ecx, 256+32(%rdi) + movl %edx, 256+56(%rdi) + movl 320+60(%rsp), %eax + movl 320+44(%rsp), %ebx + movl 320+28(%rsp), %ecx + movl 320+12(%rsp), %edx + movl %eax, 320+12(%rdi) + movl %ebx, 320+28(%rdi) + movl %ecx, 320+44(%rdi) + movl %edx, 320+60(%rdi) + movl 320+40(%rsp), %eax + movl 320+8(%rsp), %ebx + movl 320+48(%rsp), %ecx + movl 320+16(%rsp), %edx + movl %eax, 320+8(%rdi) + movl %ebx, 320+40(%rdi) + movl %ecx, 320+16(%rdi) + movl %edx, 320+48(%rdi) + movl 320+20(%rsp), %eax + movl 320+4(%rsp), %ebx + movl 320+52(%rsp), %ecx + movl 320+36(%rsp), %edx + movl %eax, 320+4(%rdi) + movl %ebx, 320+20(%rdi) + movl %ecx, 320+36(%rdi) + movl %edx, 320+52(%rdi) + movl 320+0(%rsp), %eax + movl 320+24(%rsp), %ebx + movl 320+32(%rsp), %ecx + movl 320+56(%rsp), %edx + movl %eax, 320+0(%rdi) + movl %ebx, 320+24(%rdi) + movl %ecx, 320+32(%rdi) + movl %edx, 320+56(%rdi) + + addq $392, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + popq %rdi + movdqa 8(%rsp), %xmm6 + movdqa 24(%rsp), %xmm7 + movdqa 40(%rsp), %xmm8 + movdqa 56(%rsp), %xmm9 + movdqa 72(%rsp), %xmm10 + movdqa 88(%rsp), %xmm11 + movdqa 104(%rsp), %xmm12 + movdqa 120(%rsp), %xmm13 + movdqa 136(%rsp), %xmm14 + movdqa 152(%rsp), %xmm15 + addq $176, %rsp +#endif + popq %rbp + popq %rbx + ret + +#if defined(USE_XOP) + + + .p2align 6 +scrypt_core_3way_xop: + movl 0+60(%rdi), %eax + movl 0+44(%rdi), %ebx + movl 0+28(%rdi), 
%ecx + movl 0+12(%rdi), %edx + movl %eax, 0+12(%rsp) + movl %ebx, 0+28(%rsp) + movl %ecx, 0+44(%rsp) + movl %edx, 0+60(%rsp) + movl 0+40(%rdi), %eax + movl 0+8(%rdi), %ebx + movl 0+48(%rdi), %ecx + movl 0+16(%rdi), %edx + movl %eax, 0+8(%rsp) + movl %ebx, 0+40(%rsp) + movl %ecx, 0+16(%rsp) + movl %edx, 0+48(%rsp) + movl 0+20(%rdi), %eax + movl 0+4(%rdi), %ebx + movl 0+52(%rdi), %ecx + movl 0+36(%rdi), %edx + movl %eax, 0+4(%rsp) + movl %ebx, 0+20(%rsp) + movl %ecx, 0+36(%rsp) + movl %edx, 0+52(%rsp) + movl 0+0(%rdi), %eax + movl 0+24(%rdi), %ebx + movl 0+32(%rdi), %ecx + movl 0+56(%rdi), %edx + movl %eax, 0+0(%rsp) + movl %ebx, 0+24(%rsp) + movl %ecx, 0+32(%rsp) + movl %edx, 0+56(%rsp) + movl 64+60(%rdi), %eax + movl 64+44(%rdi), %ebx + movl 64+28(%rdi), %ecx + movl 64+12(%rdi), %edx + movl %eax, 64+12(%rsp) + movl %ebx, 64+28(%rsp) + movl %ecx, 64+44(%rsp) + movl %edx, 64+60(%rsp) + movl 64+40(%rdi), %eax + movl 64+8(%rdi), %ebx + movl 64+48(%rdi), %ecx + movl 64+16(%rdi), %edx + movl %eax, 64+8(%rsp) + movl %ebx, 64+40(%rsp) + movl %ecx, 64+16(%rsp) + movl %edx, 64+48(%rsp) + movl 64+20(%rdi), %eax + movl 64+4(%rdi), %ebx + movl 64+52(%rdi), %ecx + movl 64+36(%rdi), %edx + movl %eax, 64+4(%rsp) + movl %ebx, 64+20(%rsp) + movl %ecx, 64+36(%rsp) + movl %edx, 64+52(%rsp) + movl 64+0(%rdi), %eax + movl 64+24(%rdi), %ebx + movl 64+32(%rdi), %ecx + movl 64+56(%rdi), %edx + movl %eax, 64+0(%rsp) + movl %ebx, 64+24(%rsp) + movl %ecx, 64+32(%rsp) + movl %edx, 64+56(%rsp) + movl 128+60(%rdi), %eax + movl 128+44(%rdi), %ebx + movl 128+28(%rdi), %ecx + movl 128+12(%rdi), %edx + movl %eax, 128+12(%rsp) + movl %ebx, 128+28(%rsp) + movl %ecx, 128+44(%rsp) + movl %edx, 128+60(%rsp) + movl 128+40(%rdi), %eax + movl 128+8(%rdi), %ebx + movl 128+48(%rdi), %ecx + movl 128+16(%rdi), %edx + movl %eax, 128+8(%rsp) + movl %ebx, 128+40(%rsp) + movl %ecx, 128+16(%rsp) + movl %edx, 128+48(%rsp) + movl 128+20(%rdi), %eax + movl 128+4(%rdi), %ebx + movl 128+52(%rdi), %ecx + movl 128+36(%rdi), %edx + movl %eax, 128+4(%rsp) + movl %ebx, 128+20(%rsp) + movl %ecx, 128+36(%rsp) + movl %edx, 128+52(%rsp) + movl 128+0(%rdi), %eax + movl 128+24(%rdi), %ebx + movl 128+32(%rdi), %ecx + movl 128+56(%rdi), %edx + movl %eax, 128+0(%rsp) + movl %ebx, 128+24(%rsp) + movl %ecx, 128+32(%rsp) + movl %edx, 128+56(%rsp) + movl 192+60(%rdi), %eax + movl 192+44(%rdi), %ebx + movl 192+28(%rdi), %ecx + movl 192+12(%rdi), %edx + movl %eax, 192+12(%rsp) + movl %ebx, 192+28(%rsp) + movl %ecx, 192+44(%rsp) + movl %edx, 192+60(%rsp) + movl 192+40(%rdi), %eax + movl 192+8(%rdi), %ebx + movl 192+48(%rdi), %ecx + movl 192+16(%rdi), %edx + movl %eax, 192+8(%rsp) + movl %ebx, 192+40(%rsp) + movl %ecx, 192+16(%rsp) + movl %edx, 192+48(%rsp) + movl 192+20(%rdi), %eax + movl 192+4(%rdi), %ebx + movl 192+52(%rdi), %ecx + movl 192+36(%rdi), %edx + movl %eax, 192+4(%rsp) + movl %ebx, 192+20(%rsp) + movl %ecx, 192+36(%rsp) + movl %edx, 192+52(%rsp) + movl 192+0(%rdi), %eax + movl 192+24(%rdi), %ebx + movl 192+32(%rdi), %ecx + movl 192+56(%rdi), %edx + movl %eax, 192+0(%rsp) + movl %ebx, 192+24(%rsp) + movl %ecx, 192+32(%rsp) + movl %edx, 192+56(%rsp) + movl 256+60(%rdi), %eax + movl 256+44(%rdi), %ebx + movl 256+28(%rdi), %ecx + movl 256+12(%rdi), %edx + movl %eax, 256+12(%rsp) + movl %ebx, 256+28(%rsp) + movl %ecx, 256+44(%rsp) + movl %edx, 256+60(%rsp) + movl 256+40(%rdi), %eax + movl 256+8(%rdi), %ebx + movl 256+48(%rdi), %ecx + movl 256+16(%rdi), %edx + movl %eax, 256+8(%rsp) + movl %ebx, 256+40(%rsp) + movl %ecx, 256+16(%rsp) + movl %edx, 256+48(%rsp) 
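
The long movl sequences on both sides of this function boundary (the store-back at the end of the AVX core and the load-in at the top of scrypt_core_3way_xop) apply the same self-inverse 16-word shuffle: within each 64-byte half it swaps words 1/5, 2/10, 3/15, 4/12, 7/11 and 9/13 and leaves 0, 6, 8 and 14 in place, so that each Salsa20 diagonal becomes one contiguous 16-byte vector and a single movdqa loads a whole diagonal into an xmm register; applying the shuffle again on output restores the normal order. A sketch of the permutation (diag and scrypt_shuffle_block are illustrative names):

    #include <stdint.h>

    /* Self-inverse word permutation used on the way in and out:
     * dst[0..3], dst[4..7], dst[8..11], dst[12..15] each hold one
     * Salsa20 diagonal of the 4x4 state. */
    static const int diag[16] = {
         0,  5, 10, 15,
        12,  1,  6, 11,
         8, 13,  2,  7,
         4,  9, 14,  3,
    };

    static void scrypt_shuffle_block(uint32_t dst[16], const uint32_t src[16])
    {
        int i;
        for (i = 0; i < 16; i++)
            dst[i] = src[diag[i]];   /* applying it twice is the identity */
    }
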
+ movl 256+20(%rdi), %eax + movl 256+4(%rdi), %ebx + movl 256+52(%rdi), %ecx + movl 256+36(%rdi), %edx + movl %eax, 256+4(%rsp) + movl %ebx, 256+20(%rsp) + movl %ecx, 256+36(%rsp) + movl %edx, 256+52(%rsp) + movl 256+0(%rdi), %eax + movl 256+24(%rdi), %ebx + movl 256+32(%rdi), %ecx + movl 256+56(%rdi), %edx + movl %eax, 256+0(%rsp) + movl %ebx, 256+24(%rsp) + movl %ecx, 256+32(%rsp) + movl %edx, 256+56(%rsp) + movl 320+60(%rdi), %eax + movl 320+44(%rdi), %ebx + movl 320+28(%rdi), %ecx + movl 320+12(%rdi), %edx + movl %eax, 320+12(%rsp) + movl %ebx, 320+28(%rsp) + movl %ecx, 320+44(%rsp) + movl %edx, 320+60(%rsp) + movl 320+40(%rdi), %eax + movl 320+8(%rdi), %ebx + movl 320+48(%rdi), %ecx + movl 320+16(%rdi), %edx + movl %eax, 320+8(%rsp) + movl %ebx, 320+40(%rsp) + movl %ecx, 320+16(%rsp) + movl %edx, 320+48(%rsp) + movl 320+20(%rdi), %eax + movl 320+4(%rdi), %ebx + movl 320+52(%rdi), %ecx + movl 320+36(%rdi), %edx + movl %eax, 320+4(%rsp) + movl %ebx, 320+20(%rsp) + movl %ecx, 320+36(%rsp) + movl %edx, 320+52(%rsp) + movl 320+0(%rdi), %eax + movl 320+24(%rdi), %ebx + movl 320+32(%rdi), %ecx + movl 320+56(%rdi), %edx + movl %eax, 320+0(%rsp) + movl %ebx, 320+24(%rsp) + movl %ecx, 320+32(%rsp) + movl %edx, 320+56(%rsp) + + movdqa 64(%rsp), %xmm0 + movdqa 80(%rsp), %xmm1 + movdqa 96(%rsp), %xmm2 + movdqa 112(%rsp), %xmm3 + movdqa 128+64(%rsp), %xmm8 + movdqa 128+80(%rsp), %xmm9 + movdqa 128+96(%rsp), %xmm10 + movdqa 128+112(%rsp), %xmm11 + movdqa 256+64(%rsp), %xmm12 + movdqa 256+80(%rsp), %xmm13 + movdqa 256+96(%rsp), %xmm14 + movdqa 256+112(%rsp), %xmm15 + + movq %rsi, %rbx + leaq (%r8, %r8, 2), %rax + shlq $7, %rax + addq %rsi, %rax +scrypt_core_3way_xop_loop1: + movdqa %xmm0, 64(%rbx) + movdqa %xmm1, 80(%rbx) + movdqa %xmm2, 96(%rbx) + movdqa %xmm3, 112(%rbx) + pxor 0(%rsp), %xmm0 + pxor 16(%rsp), %xmm1 + pxor 32(%rsp), %xmm2 + pxor 48(%rsp), %xmm3 + movdqa %xmm8, 128+64(%rbx) + movdqa %xmm9, 128+80(%rbx) + movdqa %xmm10, 128+96(%rbx) + movdqa %xmm11, 128+112(%rbx) + pxor 128+0(%rsp), %xmm8 + pxor 128+16(%rsp), %xmm9 + pxor 128+32(%rsp), %xmm10 + pxor 128+48(%rsp), %xmm11 + movdqa %xmm12, 256+64(%rbx) + movdqa %xmm13, 256+80(%rbx) + movdqa %xmm14, 256+96(%rbx) + movdqa %xmm15, 256+112(%rbx) + pxor 256+0(%rsp), %xmm12 + pxor 256+16(%rsp), %xmm13 + pxor 256+32(%rsp), %xmm14 + pxor 256+48(%rsp), %xmm15 + movdqa %xmm0, 0(%rbx) + movdqa %xmm1, 16(%rbx) + movdqa %xmm2, 32(%rbx) + movdqa %xmm3, 48(%rbx) + movdqa %xmm8, 128+0(%rbx) + movdqa %xmm9, 128+16(%rbx) + movdqa %xmm10, 128+32(%rbx) + movdqa %xmm11, 128+48(%rbx) + movdqa %xmm12, 256+0(%rbx) + movdqa %xmm13, 256+16(%rbx) + movdqa %xmm14, 256+32(%rbx) + movdqa %xmm15, 256+48(%rbx) + + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + 
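
scrypt_core_3way_xop_loop1 mirrors the AVX loop round for round; the only change in the round bodies is that XOP's vprotd is a true packed rotate, so each vpslld/vpsrld/vpxor/vpxor group above collapses to vprotd plus a single vpxor. In scalar terms (step_avx_style and step_xop_style are illustrative names; ROTL is the same macro as in the earlier sketch):

    #include <stdint.h>

    #define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b))))

    /* One Salsa20 quarter-step written both ways; both return the same
     * value (scalar stand-ins for the 4-lane vectors). */
    static inline uint32_t step_avx_style(uint32_t d, uint32_t a, uint32_t b)
    {
        uint32_t t = a + b;            /* vpaddd                  */
        d ^= t << 7;                   /* vpslld $7  + vpxor      */
        d ^= t >> 25;                  /* vpsrld $25 + vpxor      */
        return d;
    }

    static inline uint32_t step_xop_style(uint32_t d, uint32_t a, uint32_t b)
    {
        return d ^ ROTL(a + b, 7);     /* vpaddd + vprotd $7 + vpxor */
    }
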
vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpshufd $0x39, %xmm1, %xmm1 + vpshufd $0x39, %xmm9, %xmm9 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm7, %xmm12, %xmm12 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm11, %xmm11 + vpshufd $0x39, %xmm15, %xmm15 + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpshufd $0x39, %xmm1, %xmm1 + vpshufd $0x39, %xmm9, %xmm9 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + 
vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm7, %xmm12, %xmm12 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm11, %xmm11 + vpshufd $0x39, %xmm15, %xmm15 + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpshufd $0x39, %xmm1, %xmm1 + vpshufd $0x39, %xmm9, %xmm9 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm7, %xmm12, %xmm12 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm11, %xmm11 + vpshufd $0x39, %xmm15, %xmm15 + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vprotd 
$7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpshufd $0x39, %xmm1, %xmm1 + vpshufd $0x39, %xmm9, %xmm9 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm7, %xmm12, %xmm12 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm11, %xmm11 + vpshufd $0x39, %xmm15, %xmm15 + paddd 0(%rbx), %xmm0 + paddd 16(%rbx), %xmm1 + paddd 32(%rbx), %xmm2 + paddd 48(%rbx), %xmm3 + paddd 128+0(%rbx), %xmm8 + paddd 128+16(%rbx), %xmm9 + paddd 128+32(%rbx), %xmm10 + paddd 128+48(%rbx), %xmm11 + paddd 256+0(%rbx), %xmm12 + paddd 256+16(%rbx), %xmm13 + paddd 256+32(%rbx), %xmm14 + paddd 256+48(%rbx), %xmm15 + movdqa %xmm0, 0(%rsp) + movdqa %xmm1, 16(%rsp) + movdqa %xmm2, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm8, 128+0(%rsp) + movdqa %xmm9, 128+16(%rsp) + movdqa %xmm10, 128+32(%rsp) + movdqa %xmm11, 128+48(%rsp) + movdqa %xmm12, 256+0(%rsp) + movdqa %xmm13, 256+16(%rsp) + movdqa %xmm14, 256+32(%rsp) + movdqa %xmm15, 256+48(%rsp) + + pxor 64(%rbx), %xmm0 + pxor 80(%rbx), %xmm1 + pxor 96(%rbx), %xmm2 + pxor 112(%rbx), %xmm3 + pxor 128+64(%rbx), %xmm8 + pxor 128+80(%rbx), %xmm9 + pxor 128+96(%rbx), %xmm10 + pxor 128+112(%rbx), %xmm11 + pxor 256+64(%rbx), %xmm12 + pxor 256+80(%rbx), %xmm13 + pxor 256+96(%rbx), %xmm14 + pxor 256+112(%rbx), %xmm15 + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) 
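
For completeness, this loop1 is the first ROMix pass: each iteration stores the current 128-byte block of every lane into the scratchpad (the movdqa stores to (%rbx) at the top of the loop) and then advances the block with the two Salsa20/8 halves, stepping three interleaved entries at a time until all N entries are written. A scalar sketch for one lane, again using the xor_salsa8 from the earlier sketch (romix_fill is an illustrative name):

    #include <stdint.h>
    #include <string.h>

    /* First ROMix pass for one lane: fill V[0..N-1], advancing X as we go. */
    static void romix_fill(uint32_t X[32], uint32_t *V, uint32_t N)
    {
        uint32_t i;

        for (i = 0; i < N; i++) {
            memcpy(&V[32 * i], X, 128);    /* the movdqa stores to (%rbx) */
            xor_salsa8(&X[0], &X[16]);
            xor_salsa8(&X[16], &X[0]);
        }
    }
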
+ movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpshufd $0x39, %xmm1, %xmm1 + vpshufd $0x39, %xmm9, %xmm9 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm7, %xmm12, %xmm12 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm11, %xmm11 + vpshufd $0x39, %xmm15, %xmm15 + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vprotd $13, %xmm4, %xmm4 + 
vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpshufd $0x39, %xmm1, %xmm1 + vpshufd $0x39, %xmm9, %xmm9 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm7, %xmm12, %xmm12 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm11, %xmm11 + vpshufd $0x39, %xmm15, %xmm15 + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpshufd $0x39, %xmm1, %xmm1 + vpshufd $0x39, %xmm9, %xmm9 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, 
%xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm7, %xmm12, %xmm12 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm11, %xmm11 + vpshufd $0x39, %xmm15, %xmm15 + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpshufd $0x39, %xmm1, %xmm1 + vpshufd $0x39, %xmm9, %xmm9 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm6, 
%xmm8, %xmm8 + vpxor %xmm7, %xmm12, %xmm12 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm11, %xmm11 + vpshufd $0x39, %xmm15, %xmm15 + paddd 64(%rsp), %xmm0 + paddd 80(%rsp), %xmm1 + paddd 96(%rsp), %xmm2 + paddd 112(%rsp), %xmm3 + paddd 128+64(%rsp), %xmm8 + paddd 128+80(%rsp), %xmm9 + paddd 128+96(%rsp), %xmm10 + paddd 128+112(%rsp), %xmm11 + paddd 256+64(%rsp), %xmm12 + paddd 256+80(%rsp), %xmm13 + paddd 256+96(%rsp), %xmm14 + paddd 256+112(%rsp), %xmm15 + + addq $3*128, %rbx + cmpq %rax, %rbx + jne scrypt_core_3way_xop_loop1 + + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + + movq %r8, %rcx + subq $1, %r8 +scrypt_core_3way_xop_loop2: + movd %xmm0, %ebp + movd %xmm8, %ebx + movd %xmm12, %eax + pxor 0(%rsp), %xmm0 + pxor 16(%rsp), %xmm1 + pxor 32(%rsp), %xmm2 + pxor 48(%rsp), %xmm3 + pxor 128+0(%rsp), %xmm8 + pxor 128+16(%rsp), %xmm9 + pxor 128+32(%rsp), %xmm10 + pxor 128+48(%rsp), %xmm11 + pxor 256+0(%rsp), %xmm12 + pxor 256+16(%rsp), %xmm13 + pxor 256+32(%rsp), %xmm14 + pxor 256+48(%rsp), %xmm15 + andl %r8d, %ebp + leaq (%rbp, %rbp, 2), %rbp + shll $7, %ebp + andl %r8d, %ebx + leaq 1(%rbx, %rbx, 2), %rbx + shll $7, %ebx + andl %r8d, %eax + leaq 2(%rax, %rax, 2), %rax + shll $7, %eax + pxor 0(%rsi, %rbp), %xmm0 + pxor 16(%rsi, %rbp), %xmm1 + pxor 32(%rsi, %rbp), %xmm2 + pxor 48(%rsi, %rbp), %xmm3 + pxor 0(%rsi, %rbx), %xmm8 + pxor 16(%rsi, %rbx), %xmm9 + pxor 32(%rsi, %rbx), %xmm10 + pxor 48(%rsi, %rbx), %xmm11 + pxor 0(%rsi, %rax), %xmm12 + pxor 16(%rsi, %rax), %xmm13 + pxor 32(%rsi, %rax), %xmm14 + pxor 48(%rsi, %rax), %xmm15 + + movdqa %xmm0, 0(%rsp) + movdqa %xmm1, 16(%rsp) + movdqa %xmm2, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm8, 128+0(%rsp) + movdqa %xmm9, 128+16(%rsp) + movdqa %xmm10, 128+32(%rsp) + movdqa %xmm11, 128+48(%rsp) + movdqa %xmm12, 256+0(%rsp) + movdqa %xmm13, 256+16(%rsp) + movdqa %xmm14, 256+32(%rsp) + movdqa %xmm15, 256+48(%rsp) + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, 
%xmm6 + vprotd $7, %xmm7, %xmm7 + vpshufd $0x39, %xmm1, %xmm1 + vpshufd $0x39, %xmm9, %xmm9 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm7, %xmm12, %xmm12 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm11, %xmm11 + vpshufd $0x39, %xmm15, %xmm15 + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpshufd $0x39, %xmm1, %xmm1 + vpshufd $0x39, %xmm9, %xmm9 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vprotd $18, 
%xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm7, %xmm12, %xmm12 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm11, %xmm11 + vpshufd $0x39, %xmm15, %xmm15 + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpshufd $0x39, %xmm1, %xmm1 + vpshufd $0x39, %xmm9, %xmm9 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm7, %xmm12, %xmm12 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm11, %xmm11 + vpshufd $0x39, %xmm15, %xmm15 + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd 
%xmm14, %xmm15, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpshufd $0x39, %xmm1, %xmm1 + vpshufd $0x39, %xmm9, %xmm9 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm7, %xmm12, %xmm12 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm11, %xmm11 + vpshufd $0x39, %xmm15, %xmm15 + paddd 0(%rsp), %xmm0 + paddd 16(%rsp), %xmm1 + paddd 32(%rsp), %xmm2 + paddd 48(%rsp), %xmm3 + paddd 128+0(%rsp), %xmm8 + paddd 128+16(%rsp), %xmm9 + paddd 128+32(%rsp), %xmm10 + paddd 128+48(%rsp), %xmm11 + paddd 256+0(%rsp), %xmm12 + paddd 256+16(%rsp), %xmm13 + paddd 256+32(%rsp), %xmm14 + paddd 256+48(%rsp), %xmm15 + movdqa %xmm0, 0(%rsp) + movdqa %xmm1, 16(%rsp) + movdqa %xmm2, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm8, 128+0(%rsp) + movdqa %xmm9, 128+16(%rsp) + movdqa %xmm10, 128+32(%rsp) + movdqa %xmm11, 128+48(%rsp) + movdqa %xmm12, 256+0(%rsp) + movdqa %xmm13, 256+16(%rsp) + movdqa %xmm14, 256+32(%rsp) + movdqa %xmm15, 256+48(%rsp) + + pxor 64(%rsi, %rbp), %xmm0 + pxor 80(%rsi, %rbp), %xmm1 + pxor 96(%rsi, %rbp), %xmm2 + pxor 112(%rsi, %rbp), %xmm3 + pxor 64(%rsi, %rbx), %xmm8 + pxor 80(%rsi, %rbx), %xmm9 + pxor 96(%rsi, %rbx), %xmm10 + pxor 112(%rsi, %rbx), %xmm11 + pxor 64(%rsi, %rax), %xmm12 + pxor 80(%rsi, %rax), %xmm13 + pxor 96(%rsi, %rax), %xmm14 + pxor 112(%rsi, %rax), %xmm15 + pxor 64(%rsp), %xmm0 + pxor 80(%rsp), %xmm1 + pxor 96(%rsp), %xmm2 + pxor 112(%rsp), %xmm3 + pxor 128+64(%rsp), %xmm8 + pxor 128+80(%rsp), %xmm9 + pxor 128+96(%rsp), %xmm10 + pxor 128+112(%rsp), %xmm11 + pxor 256+64(%rsp), %xmm12 + pxor 256+80(%rsp), %xmm13 + pxor 256+96(%rsp), %xmm14 + pxor 256+112(%rsp), %xmm15 + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 
128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpshufd $0x39, %xmm1, %xmm1 + vpshufd $0x39, %xmm9, %xmm9 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm7, %xmm12, %xmm12 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm11, %xmm11 + vpshufd $0x39, %xmm15, %xmm15 + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd 
$0x93, %xmm15, %xmm15 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpshufd $0x39, %xmm1, %xmm1 + vpshufd $0x39, %xmm9, %xmm9 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm7, %xmm12, %xmm12 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm11, %xmm11 + vpshufd $0x39, %xmm15, %xmm15 + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpshufd $0x39, %xmm1, %xmm1 + vpshufd $0x39, %xmm9, %xmm9 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + 
vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm7, %xmm12, %xmm12 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm11, %xmm11 + vpshufd $0x39, %xmm15, %xmm15 + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpshufd $0x39, %xmm1, %xmm1 + vpshufd $0x39, %xmm9, %xmm9 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm7, %xmm12, %xmm12 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm11, %xmm11 + vpshufd $0x39, 
%xmm15, %xmm15 + paddd 64(%rsp), %xmm0 + paddd 80(%rsp), %xmm1 + paddd 96(%rsp), %xmm2 + paddd 112(%rsp), %xmm3 + paddd 128+64(%rsp), %xmm8 + paddd 128+80(%rsp), %xmm9 + paddd 128+96(%rsp), %xmm10 + paddd 128+112(%rsp), %xmm11 + paddd 256+64(%rsp), %xmm12 + paddd 256+80(%rsp), %xmm13 + paddd 256+96(%rsp), %xmm14 + paddd 256+112(%rsp), %xmm15 + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + + subq $1, %rcx + ja scrypt_core_3way_xop_loop2 + + movl 0+60(%rsp), %eax + movl 0+44(%rsp), %ebx + movl 0+28(%rsp), %ecx + movl 0+12(%rsp), %edx + movl %eax, 0+12(%rdi) + movl %ebx, 0+28(%rdi) + movl %ecx, 0+44(%rdi) + movl %edx, 0+60(%rdi) + movl 0+40(%rsp), %eax + movl 0+8(%rsp), %ebx + movl 0+48(%rsp), %ecx + movl 0+16(%rsp), %edx + movl %eax, 0+8(%rdi) + movl %ebx, 0+40(%rdi) + movl %ecx, 0+16(%rdi) + movl %edx, 0+48(%rdi) + movl 0+20(%rsp), %eax + movl 0+4(%rsp), %ebx + movl 0+52(%rsp), %ecx + movl 0+36(%rsp), %edx + movl %eax, 0+4(%rdi) + movl %ebx, 0+20(%rdi) + movl %ecx, 0+36(%rdi) + movl %edx, 0+52(%rdi) + movl 0+0(%rsp), %eax + movl 0+24(%rsp), %ebx + movl 0+32(%rsp), %ecx + movl 0+56(%rsp), %edx + movl %eax, 0+0(%rdi) + movl %ebx, 0+24(%rdi) + movl %ecx, 0+32(%rdi) + movl %edx, 0+56(%rdi) + movl 64+60(%rsp), %eax + movl 64+44(%rsp), %ebx + movl 64+28(%rsp), %ecx + movl 64+12(%rsp), %edx + movl %eax, 64+12(%rdi) + movl %ebx, 64+28(%rdi) + movl %ecx, 64+44(%rdi) + movl %edx, 64+60(%rdi) + movl 64+40(%rsp), %eax + movl 64+8(%rsp), %ebx + movl 64+48(%rsp), %ecx + movl 64+16(%rsp), %edx + movl %eax, 64+8(%rdi) + movl %ebx, 64+40(%rdi) + movl %ecx, 64+16(%rdi) + movl %edx, 64+48(%rdi) + movl 64+20(%rsp), %eax + movl 64+4(%rsp), %ebx + movl 64+52(%rsp), %ecx + movl 64+36(%rsp), %edx + movl %eax, 64+4(%rdi) + movl %ebx, 64+20(%rdi) + movl %ecx, 64+36(%rdi) + movl %edx, 64+52(%rdi) + movl 64+0(%rsp), %eax + movl 64+24(%rsp), %ebx + movl 64+32(%rsp), %ecx + movl 64+56(%rsp), %edx + movl %eax, 64+0(%rdi) + movl %ebx, 64+24(%rdi) + movl %ecx, 64+32(%rdi) + movl %edx, 64+56(%rdi) + movl 128+60(%rsp), %eax + movl 128+44(%rsp), %ebx + movl 128+28(%rsp), %ecx + movl 128+12(%rsp), %edx + movl %eax, 128+12(%rdi) + movl %ebx, 128+28(%rdi) + movl %ecx, 128+44(%rdi) + movl %edx, 128+60(%rdi) + movl 128+40(%rsp), %eax + movl 128+8(%rsp), %ebx + movl 128+48(%rsp), %ecx + movl 128+16(%rsp), %edx + movl %eax, 128+8(%rdi) + movl %ebx, 128+40(%rdi) + movl %ecx, 128+16(%rdi) + movl %edx, 128+48(%rdi) + movl 128+20(%rsp), %eax + movl 128+4(%rsp), %ebx + movl 128+52(%rsp), %ecx + movl 128+36(%rsp), %edx + movl %eax, 128+4(%rdi) + movl %ebx, 128+20(%rdi) + movl %ecx, 128+36(%rdi) + movl %edx, 128+52(%rdi) + movl 128+0(%rsp), %eax + movl 128+24(%rsp), %ebx + movl 128+32(%rsp), %ecx + movl 128+56(%rsp), %edx + movl %eax, 128+0(%rdi) + movl %ebx, 128+24(%rdi) + movl %ecx, 128+32(%rdi) + movl %edx, 128+56(%rdi) + movl 192+60(%rsp), %eax + movl 192+44(%rsp), %ebx + movl 192+28(%rsp), %ecx + movl 192+12(%rsp), %edx + movl %eax, 192+12(%rdi) + movl %ebx, 192+28(%rdi) + movl %ecx, 192+44(%rdi) + movl %edx, 192+60(%rdi) + movl 192+40(%rsp), %eax + movl 192+8(%rsp), %ebx + movl 192+48(%rsp), %ecx + movl 192+16(%rsp), %edx + movl %eax, 192+8(%rdi) + movl %ebx, 192+40(%rdi) + movl %ecx, 192+16(%rdi) + movl %edx, 192+48(%rdi) + movl 192+20(%rsp), 
%eax + movl 192+4(%rsp), %ebx + movl 192+52(%rsp), %ecx + movl 192+36(%rsp), %edx + movl %eax, 192+4(%rdi) + movl %ebx, 192+20(%rdi) + movl %ecx, 192+36(%rdi) + movl %edx, 192+52(%rdi) + movl 192+0(%rsp), %eax + movl 192+24(%rsp), %ebx + movl 192+32(%rsp), %ecx + movl 192+56(%rsp), %edx + movl %eax, 192+0(%rdi) + movl %ebx, 192+24(%rdi) + movl %ecx, 192+32(%rdi) + movl %edx, 192+56(%rdi) + movl 256+60(%rsp), %eax + movl 256+44(%rsp), %ebx + movl 256+28(%rsp), %ecx + movl 256+12(%rsp), %edx + movl %eax, 256+12(%rdi) + movl %ebx, 256+28(%rdi) + movl %ecx, 256+44(%rdi) + movl %edx, 256+60(%rdi) + movl 256+40(%rsp), %eax + movl 256+8(%rsp), %ebx + movl 256+48(%rsp), %ecx + movl 256+16(%rsp), %edx + movl %eax, 256+8(%rdi) + movl %ebx, 256+40(%rdi) + movl %ecx, 256+16(%rdi) + movl %edx, 256+48(%rdi) + movl 256+20(%rsp), %eax + movl 256+4(%rsp), %ebx + movl 256+52(%rsp), %ecx + movl 256+36(%rsp), %edx + movl %eax, 256+4(%rdi) + movl %ebx, 256+20(%rdi) + movl %ecx, 256+36(%rdi) + movl %edx, 256+52(%rdi) + movl 256+0(%rsp), %eax + movl 256+24(%rsp), %ebx + movl 256+32(%rsp), %ecx + movl 256+56(%rsp), %edx + movl %eax, 256+0(%rdi) + movl %ebx, 256+24(%rdi) + movl %ecx, 256+32(%rdi) + movl %edx, 256+56(%rdi) + movl 320+60(%rsp), %eax + movl 320+44(%rsp), %ebx + movl 320+28(%rsp), %ecx + movl 320+12(%rsp), %edx + movl %eax, 320+12(%rdi) + movl %ebx, 320+28(%rdi) + movl %ecx, 320+44(%rdi) + movl %edx, 320+60(%rdi) + movl 320+40(%rsp), %eax + movl 320+8(%rsp), %ebx + movl 320+48(%rsp), %ecx + movl 320+16(%rsp), %edx + movl %eax, 320+8(%rdi) + movl %ebx, 320+40(%rdi) + movl %ecx, 320+16(%rdi) + movl %edx, 320+48(%rdi) + movl 320+20(%rsp), %eax + movl 320+4(%rsp), %ebx + movl 320+52(%rsp), %ecx + movl 320+36(%rsp), %edx + movl %eax, 320+4(%rdi) + movl %ebx, 320+20(%rdi) + movl %ecx, 320+36(%rdi) + movl %edx, 320+52(%rdi) + movl 320+0(%rsp), %eax + movl 320+24(%rsp), %ebx + movl 320+32(%rsp), %ecx + movl 320+56(%rsp), %edx + movl %eax, 320+0(%rdi) + movl %ebx, 320+24(%rdi) + movl %ecx, 320+32(%rdi) + movl %edx, 320+56(%rdi) + + addq $392, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + popq %rdi + movdqa 8(%rsp), %xmm6 + movdqa 24(%rsp), %xmm7 + movdqa 40(%rsp), %xmm8 + movdqa 56(%rsp), %xmm9 + movdqa 72(%rsp), %xmm10 + movdqa 88(%rsp), %xmm11 + movdqa 104(%rsp), %xmm12 + movdqa 120(%rsp), %xmm13 + movdqa 136(%rsp), %xmm14 + movdqa 152(%rsp), %xmm15 + addq $176, %rsp +#endif + popq %rbp + popq %rbx + ret +#endif /* USE_XOP */ +#endif /* USE_AVX */ + + + + .p2align 6 +scrypt_core_3way_xmm: + movl 0+60(%rdi), %eax + movl 0+44(%rdi), %ebx + movl 0+28(%rdi), %ecx + movl 0+12(%rdi), %edx + movl %eax, 0+12(%rsp) + movl %ebx, 0+28(%rsp) + movl %ecx, 0+44(%rsp) + movl %edx, 0+60(%rsp) + movl 0+40(%rdi), %eax + movl 0+8(%rdi), %ebx + movl 0+48(%rdi), %ecx + movl 0+16(%rdi), %edx + movl %eax, 0+8(%rsp) + movl %ebx, 0+40(%rsp) + movl %ecx, 0+16(%rsp) + movl %edx, 0+48(%rsp) + movl 0+20(%rdi), %eax + movl 0+4(%rdi), %ebx + movl 0+52(%rdi), %ecx + movl 0+36(%rdi), %edx + movl %eax, 0+4(%rsp) + movl %ebx, 0+20(%rsp) + movl %ecx, 0+36(%rsp) + movl %edx, 0+52(%rsp) + movl 0+0(%rdi), %eax + movl 0+24(%rdi), %ebx + movl 0+32(%rdi), %ecx + movl 0+56(%rdi), %edx + movl %eax, 0+0(%rsp) + movl %ebx, 0+24(%rsp) + movl %ecx, 0+32(%rsp) + movl %edx, 0+56(%rsp) + movl 64+60(%rdi), %eax + movl 64+44(%rdi), %ebx + movl 64+28(%rdi), %ecx + movl 64+12(%rdi), %edx + movl %eax, 64+12(%rsp) + movl %ebx, 64+28(%rsp) + movl %ecx, 64+44(%rsp) + movl %edx, 64+60(%rsp) + movl 64+40(%rdi), %eax + movl 64+8(%rdi), %ebx + movl 
64+48(%rdi), %ecx + movl 64+16(%rdi), %edx + movl %eax, 64+8(%rsp) + movl %ebx, 64+40(%rsp) + movl %ecx, 64+16(%rsp) + movl %edx, 64+48(%rsp) + movl 64+20(%rdi), %eax + movl 64+4(%rdi), %ebx + movl 64+52(%rdi), %ecx + movl 64+36(%rdi), %edx + movl %eax, 64+4(%rsp) + movl %ebx, 64+20(%rsp) + movl %ecx, 64+36(%rsp) + movl %edx, 64+52(%rsp) + movl 64+0(%rdi), %eax + movl 64+24(%rdi), %ebx + movl 64+32(%rdi), %ecx + movl 64+56(%rdi), %edx + movl %eax, 64+0(%rsp) + movl %ebx, 64+24(%rsp) + movl %ecx, 64+32(%rsp) + movl %edx, 64+56(%rsp) + movl 128+60(%rdi), %eax + movl 128+44(%rdi), %ebx + movl 128+28(%rdi), %ecx + movl 128+12(%rdi), %edx + movl %eax, 128+12(%rsp) + movl %ebx, 128+28(%rsp) + movl %ecx, 128+44(%rsp) + movl %edx, 128+60(%rsp) + movl 128+40(%rdi), %eax + movl 128+8(%rdi), %ebx + movl 128+48(%rdi), %ecx + movl 128+16(%rdi), %edx + movl %eax, 128+8(%rsp) + movl %ebx, 128+40(%rsp) + movl %ecx, 128+16(%rsp) + movl %edx, 128+48(%rsp) + movl 128+20(%rdi), %eax + movl 128+4(%rdi), %ebx + movl 128+52(%rdi), %ecx + movl 128+36(%rdi), %edx + movl %eax, 128+4(%rsp) + movl %ebx, 128+20(%rsp) + movl %ecx, 128+36(%rsp) + movl %edx, 128+52(%rsp) + movl 128+0(%rdi), %eax + movl 128+24(%rdi), %ebx + movl 128+32(%rdi), %ecx + movl 128+56(%rdi), %edx + movl %eax, 128+0(%rsp) + movl %ebx, 128+24(%rsp) + movl %ecx, 128+32(%rsp) + movl %edx, 128+56(%rsp) + movl 192+60(%rdi), %eax + movl 192+44(%rdi), %ebx + movl 192+28(%rdi), %ecx + movl 192+12(%rdi), %edx + movl %eax, 192+12(%rsp) + movl %ebx, 192+28(%rsp) + movl %ecx, 192+44(%rsp) + movl %edx, 192+60(%rsp) + movl 192+40(%rdi), %eax + movl 192+8(%rdi), %ebx + movl 192+48(%rdi), %ecx + movl 192+16(%rdi), %edx + movl %eax, 192+8(%rsp) + movl %ebx, 192+40(%rsp) + movl %ecx, 192+16(%rsp) + movl %edx, 192+48(%rsp) + movl 192+20(%rdi), %eax + movl 192+4(%rdi), %ebx + movl 192+52(%rdi), %ecx + movl 192+36(%rdi), %edx + movl %eax, 192+4(%rsp) + movl %ebx, 192+20(%rsp) + movl %ecx, 192+36(%rsp) + movl %edx, 192+52(%rsp) + movl 192+0(%rdi), %eax + movl 192+24(%rdi), %ebx + movl 192+32(%rdi), %ecx + movl 192+56(%rdi), %edx + movl %eax, 192+0(%rsp) + movl %ebx, 192+24(%rsp) + movl %ecx, 192+32(%rsp) + movl %edx, 192+56(%rsp) + movl 256+60(%rdi), %eax + movl 256+44(%rdi), %ebx + movl 256+28(%rdi), %ecx + movl 256+12(%rdi), %edx + movl %eax, 256+12(%rsp) + movl %ebx, 256+28(%rsp) + movl %ecx, 256+44(%rsp) + movl %edx, 256+60(%rsp) + movl 256+40(%rdi), %eax + movl 256+8(%rdi), %ebx + movl 256+48(%rdi), %ecx + movl 256+16(%rdi), %edx + movl %eax, 256+8(%rsp) + movl %ebx, 256+40(%rsp) + movl %ecx, 256+16(%rsp) + movl %edx, 256+48(%rsp) + movl 256+20(%rdi), %eax + movl 256+4(%rdi), %ebx + movl 256+52(%rdi), %ecx + movl 256+36(%rdi), %edx + movl %eax, 256+4(%rsp) + movl %ebx, 256+20(%rsp) + movl %ecx, 256+36(%rsp) + movl %edx, 256+52(%rsp) + movl 256+0(%rdi), %eax + movl 256+24(%rdi), %ebx + movl 256+32(%rdi), %ecx + movl 256+56(%rdi), %edx + movl %eax, 256+0(%rsp) + movl %ebx, 256+24(%rsp) + movl %ecx, 256+32(%rsp) + movl %edx, 256+56(%rsp) + movl 320+60(%rdi), %eax + movl 320+44(%rdi), %ebx + movl 320+28(%rdi), %ecx + movl 320+12(%rdi), %edx + movl %eax, 320+12(%rsp) + movl %ebx, 320+28(%rsp) + movl %ecx, 320+44(%rsp) + movl %edx, 320+60(%rsp) + movl 320+40(%rdi), %eax + movl 320+8(%rdi), %ebx + movl 320+48(%rdi), %ecx + movl 320+16(%rdi), %edx + movl %eax, 320+8(%rsp) + movl %ebx, 320+40(%rsp) + movl %ecx, 320+16(%rsp) + movl %edx, 320+48(%rsp) + movl 320+20(%rdi), %eax + movl 320+4(%rdi), %ebx + movl 320+52(%rdi), %ecx + movl 320+36(%rdi), %edx + movl %eax, 
320+4(%rsp) + movl %ebx, 320+20(%rsp) + movl %ecx, 320+36(%rsp) + movl %edx, 320+52(%rsp) + movl 320+0(%rdi), %eax + movl 320+24(%rdi), %ebx + movl 320+32(%rdi), %ecx + movl 320+56(%rdi), %edx + movl %eax, 320+0(%rsp) + movl %ebx, 320+24(%rsp) + movl %ecx, 320+32(%rsp) + movl %edx, 320+56(%rsp) + + movdqa 64(%rsp), %xmm0 + movdqa 80(%rsp), %xmm1 + movdqa 96(%rsp), %xmm2 + movdqa 112(%rsp), %xmm3 + movdqa 128+64(%rsp), %xmm8 + movdqa 128+80(%rsp), %xmm9 + movdqa 128+96(%rsp), %xmm10 + movdqa 128+112(%rsp), %xmm11 + movdqa 256+64(%rsp), %xmm12 + movdqa 256+80(%rsp), %xmm13 + movdqa 256+96(%rsp), %xmm14 + movdqa 256+112(%rsp), %xmm15 + + movq %rsi, %rbx + leaq (%r8, %r8, 2), %rax + shlq $7, %rax + addq %rsi, %rax +scrypt_core_3way_xmm_loop1: + movdqa %xmm0, 64(%rbx) + movdqa %xmm1, 80(%rbx) + movdqa %xmm2, 96(%rbx) + movdqa %xmm3, 112(%rbx) + pxor 0(%rsp), %xmm0 + pxor 16(%rsp), %xmm1 + pxor 32(%rsp), %xmm2 + pxor 48(%rsp), %xmm3 + movdqa %xmm8, 128+64(%rbx) + movdqa %xmm9, 128+80(%rbx) + movdqa %xmm10, 128+96(%rbx) + movdqa %xmm11, 128+112(%rbx) + pxor 128+0(%rsp), %xmm8 + pxor 128+16(%rsp), %xmm9 + pxor 128+32(%rsp), %xmm10 + pxor 128+48(%rsp), %xmm11 + movdqa %xmm12, 256+64(%rbx) + movdqa %xmm13, 256+80(%rbx) + movdqa %xmm14, 256+96(%rbx) + movdqa %xmm15, 256+112(%rbx) + pxor 256+0(%rsp), %xmm12 + pxor 256+16(%rsp), %xmm13 + pxor 256+32(%rsp), %xmm14 + pxor 256+48(%rsp), %xmm15 + movdqa %xmm0, 0(%rbx) + movdqa %xmm1, 16(%rbx) + movdqa %xmm2, 32(%rbx) + movdqa %xmm3, 48(%rbx) + movdqa %xmm8, 128+0(%rbx) + movdqa %xmm9, 128+16(%rbx) + movdqa %xmm10, 128+32(%rbx) + movdqa %xmm11, 128+48(%rbx) + movdqa %xmm12, 256+0(%rbx) + movdqa %xmm13, 256+16(%rbx) + movdqa %xmm14, 256+32(%rbx) + movdqa %xmm15, 256+48(%rbx) + + movdqa %xmm1, %xmm4 + movdqa %xmm9, %xmm6 + movdqa %xmm13, %xmm7 + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + pxor %xmm5, %xmm3 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm11 + pxor %xmm5, %xmm11 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm15 + pxor %xmm5, %xmm15 + movdqa %xmm12, %xmm7 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pshufd $0x93, %xmm3, %xmm3 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm11, %xmm6 + pshufd $0x93, %xmm11, %xmm11 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm14 + pshufd $0x93, %xmm15, %xmm15 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm1 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm9 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm9 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm13 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm13 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + pxor %xmm5, %xmm0 + movdqa %xmm3, %xmm4 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + 
psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm9, %xmm9 + pxor %xmm5, %xmm8 + movdqa %xmm11, %xmm6 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm12 + pshufd $0x39, %xmm13, %xmm13 + + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + pxor %xmm5, %xmm1 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm9 + pxor %xmm5, %xmm9 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm13 + pxor %xmm5, %xmm13 + movdqa %xmm12, %xmm7 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pshufd $0x93, %xmm1, %xmm1 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm9, %xmm6 + pshufd $0x93, %xmm9, %xmm9 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm13, %xmm7 + pshufd $0x93, %xmm13, %xmm13 + pxor %xmm5, %xmm14 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm3 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm11 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm11 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm15 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm15 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm11, %xmm11 + pxor %xmm5, %xmm8 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + pshufd $0x39, %xmm15, %xmm15 + pxor %xmm5, %xmm12 + movdqa %xmm1, %xmm4 + movdqa %xmm9, %xmm6 + movdqa %xmm13, %xmm7 + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + pxor %xmm5, %xmm3 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm11 + pxor %xmm5, %xmm11 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm15 + pxor %xmm5, %xmm15 + movdqa %xmm12, %xmm7 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pshufd $0x93, %xmm3, %xmm3 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm11, %xmm6 + pshufd $0x93, %xmm11, %xmm11 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm14 + pshufd $0x93, %xmm15, %xmm15 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm1 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm9 + movdqa %xmm10, 
%xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm9 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm13 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm13 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + pxor %xmm5, %xmm0 + movdqa %xmm3, %xmm4 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm9, %xmm9 + pxor %xmm5, %xmm8 + movdqa %xmm11, %xmm6 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm12 + pshufd $0x39, %xmm13, %xmm13 + + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + pxor %xmm5, %xmm1 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm9 + pxor %xmm5, %xmm9 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm13 + pxor %xmm5, %xmm13 + movdqa %xmm12, %xmm7 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pshufd $0x93, %xmm1, %xmm1 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm9, %xmm6 + pshufd $0x93, %xmm9, %xmm9 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm13, %xmm7 + pshufd $0x93, %xmm13, %xmm13 + pxor %xmm5, %xmm14 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm3 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm11 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm11 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm15 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm15 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm11, %xmm11 + pxor %xmm5, %xmm8 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + pshufd $0x39, %xmm15, %xmm15 + pxor %xmm5, %xmm12 + movdqa %xmm1, %xmm4 + movdqa %xmm9, %xmm6 + movdqa %xmm13, %xmm7 + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + pxor %xmm5, %xmm3 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm11 + pxor %xmm5, %xmm11 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm15 + pxor %xmm5, %xmm15 + movdqa %xmm12, %xmm7 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pshufd $0x93, %xmm3, %xmm3 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm11, %xmm6 + pshufd $0x93, %xmm11, %xmm11 + pxor %xmm5, %xmm10 
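+	/* Same Salsa20/8 rounds, plain SSE2 (xmm) path: without a vector
+	 * rotate instruction, each ROTL32 is emulated as a pslld/psrld pair
+	 * whose two halves are XORed into the destination word. */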
+ movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm14 + pshufd $0x93, %xmm15, %xmm15 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm1 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm9 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm9 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm13 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm13 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + pxor %xmm5, %xmm0 + movdqa %xmm3, %xmm4 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm9, %xmm9 + pxor %xmm5, %xmm8 + movdqa %xmm11, %xmm6 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm12 + pshufd $0x39, %xmm13, %xmm13 + + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + pxor %xmm5, %xmm1 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm9 + pxor %xmm5, %xmm9 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm13 + pxor %xmm5, %xmm13 + movdqa %xmm12, %xmm7 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pshufd $0x93, %xmm1, %xmm1 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm9, %xmm6 + pshufd $0x93, %xmm9, %xmm9 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm13, %xmm7 + pshufd $0x93, %xmm13, %xmm13 + pxor %xmm5, %xmm14 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm3 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm11 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm11 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm15 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm15 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm11, %xmm11 + pxor %xmm5, %xmm8 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + pshufd $0x39, %xmm15, %xmm15 + pxor %xmm5, %xmm12 + movdqa %xmm1, %xmm4 + movdqa %xmm9, %xmm6 + movdqa %xmm13, %xmm7 + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + pxor %xmm5, %xmm3 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm11 + pxor %xmm5, %xmm11 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + 
pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm15 + pxor %xmm5, %xmm15 + movdqa %xmm12, %xmm7 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pshufd $0x93, %xmm3, %xmm3 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm11, %xmm6 + pshufd $0x93, %xmm11, %xmm11 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm14 + pshufd $0x93, %xmm15, %xmm15 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm1 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm9 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm9 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm13 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm13 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + pxor %xmm5, %xmm0 + movdqa %xmm3, %xmm4 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm9, %xmm9 + pxor %xmm5, %xmm8 + movdqa %xmm11, %xmm6 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm12 + pshufd $0x39, %xmm13, %xmm13 + + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + pxor %xmm5, %xmm1 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm9 + pxor %xmm5, %xmm9 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm13 + pxor %xmm5, %xmm13 + movdqa %xmm12, %xmm7 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pshufd $0x93, %xmm1, %xmm1 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm9, %xmm6 + pshufd $0x93, %xmm9, %xmm9 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm13, %xmm7 + pshufd $0x93, %xmm13, %xmm13 + pxor %xmm5, %xmm14 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm3 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm11 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm11 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm15 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm15 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm11, %xmm11 + pxor %xmm5, %xmm8 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + 
pxor %xmm7, %xmm12 + pshufd $0x39, %xmm15, %xmm15 + pxor %xmm5, %xmm12 + paddd 0(%rbx), %xmm0 + paddd 16(%rbx), %xmm1 + paddd 32(%rbx), %xmm2 + paddd 48(%rbx), %xmm3 + paddd 128+0(%rbx), %xmm8 + paddd 128+16(%rbx), %xmm9 + paddd 128+32(%rbx), %xmm10 + paddd 128+48(%rbx), %xmm11 + paddd 256+0(%rbx), %xmm12 + paddd 256+16(%rbx), %xmm13 + paddd 256+32(%rbx), %xmm14 + paddd 256+48(%rbx), %xmm15 + movdqa %xmm0, 0(%rsp) + movdqa %xmm1, 16(%rsp) + movdqa %xmm2, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm8, 128+0(%rsp) + movdqa %xmm9, 128+16(%rsp) + movdqa %xmm10, 128+32(%rsp) + movdqa %xmm11, 128+48(%rsp) + movdqa %xmm12, 256+0(%rsp) + movdqa %xmm13, 256+16(%rsp) + movdqa %xmm14, 256+32(%rsp) + movdqa %xmm15, 256+48(%rsp) + + pxor 64(%rbx), %xmm0 + pxor 80(%rbx), %xmm1 + pxor 96(%rbx), %xmm2 + pxor 112(%rbx), %xmm3 + pxor 128+64(%rbx), %xmm8 + pxor 128+80(%rbx), %xmm9 + pxor 128+96(%rbx), %xmm10 + pxor 128+112(%rbx), %xmm11 + pxor 256+64(%rbx), %xmm12 + pxor 256+80(%rbx), %xmm13 + pxor 256+96(%rbx), %xmm14 + pxor 256+112(%rbx), %xmm15 + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + movdqa %xmm1, %xmm4 + movdqa %xmm9, %xmm6 + movdqa %xmm13, %xmm7 + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + pxor %xmm5, %xmm3 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm11 + pxor %xmm5, %xmm11 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm15 + pxor %xmm5, %xmm15 + movdqa %xmm12, %xmm7 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pshufd $0x93, %xmm3, %xmm3 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm11, %xmm6 + pshufd $0x93, %xmm11, %xmm11 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm14 + pshufd $0x93, %xmm15, %xmm15 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm1 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm9 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm9 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm13 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm13 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + pxor %xmm5, %xmm0 + movdqa %xmm3, %xmm4 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm9, %xmm9 + pxor %xmm5, %xmm8 + movdqa %xmm11, %xmm6 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm12 + pshufd $0x39, %xmm13, %xmm13 + + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, 
%xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + pxor %xmm5, %xmm1 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm9 + pxor %xmm5, %xmm9 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm13 + pxor %xmm5, %xmm13 + movdqa %xmm12, %xmm7 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pshufd $0x93, %xmm1, %xmm1 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm9, %xmm6 + pshufd $0x93, %xmm9, %xmm9 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm13, %xmm7 + pshufd $0x93, %xmm13, %xmm13 + pxor %xmm5, %xmm14 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm3 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm11 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm11 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm15 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm15 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm11, %xmm11 + pxor %xmm5, %xmm8 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + pshufd $0x39, %xmm15, %xmm15 + pxor %xmm5, %xmm12 + movdqa %xmm1, %xmm4 + movdqa %xmm9, %xmm6 + movdqa %xmm13, %xmm7 + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + pxor %xmm5, %xmm3 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm11 + pxor %xmm5, %xmm11 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm15 + pxor %xmm5, %xmm15 + movdqa %xmm12, %xmm7 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pshufd $0x93, %xmm3, %xmm3 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm11, %xmm6 + pshufd $0x93, %xmm11, %xmm11 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm14 + pshufd $0x93, %xmm15, %xmm15 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm1 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm9 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm9 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm13 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm13 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor 
%xmm4, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + pxor %xmm5, %xmm0 + movdqa %xmm3, %xmm4 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm9, %xmm9 + pxor %xmm5, %xmm8 + movdqa %xmm11, %xmm6 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm12 + pshufd $0x39, %xmm13, %xmm13 + + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + pxor %xmm5, %xmm1 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm9 + pxor %xmm5, %xmm9 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm13 + pxor %xmm5, %xmm13 + movdqa %xmm12, %xmm7 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pshufd $0x93, %xmm1, %xmm1 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm9, %xmm6 + pshufd $0x93, %xmm9, %xmm9 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm13, %xmm7 + pshufd $0x93, %xmm13, %xmm13 + pxor %xmm5, %xmm14 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm3 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm11 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm11 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm15 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm15 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm11, %xmm11 + pxor %xmm5, %xmm8 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + pshufd $0x39, %xmm15, %xmm15 + pxor %xmm5, %xmm12 + movdqa %xmm1, %xmm4 + movdqa %xmm9, %xmm6 + movdqa %xmm13, %xmm7 + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + pxor %xmm5, %xmm3 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm11 + pxor %xmm5, %xmm11 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm15 + pxor %xmm5, %xmm15 + movdqa %xmm12, %xmm7 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pshufd $0x93, %xmm3, %xmm3 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm11, %xmm6 + pshufd $0x93, %xmm11, %xmm11 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm14 + pshufd $0x93, %xmm15, %xmm15 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, 
%xmm2 + pxor %xmm5, %xmm1 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm9 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm9 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm13 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm13 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + pxor %xmm5, %xmm0 + movdqa %xmm3, %xmm4 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm9, %xmm9 + pxor %xmm5, %xmm8 + movdqa %xmm11, %xmm6 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm12 + pshufd $0x39, %xmm13, %xmm13 + + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + pxor %xmm5, %xmm1 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm9 + pxor %xmm5, %xmm9 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm13 + pxor %xmm5, %xmm13 + movdqa %xmm12, %xmm7 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pshufd $0x93, %xmm1, %xmm1 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm9, %xmm6 + pshufd $0x93, %xmm9, %xmm9 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm13, %xmm7 + pshufd $0x93, %xmm13, %xmm13 + pxor %xmm5, %xmm14 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm3 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm11 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm11 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm15 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm15 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm11, %xmm11 + pxor %xmm5, %xmm8 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + pshufd $0x39, %xmm15, %xmm15 + pxor %xmm5, %xmm12 + movdqa %xmm1, %xmm4 + movdqa %xmm9, %xmm6 + movdqa %xmm13, %xmm7 + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + pxor %xmm5, %xmm3 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm11 + pxor %xmm5, %xmm11 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm15 + pxor %xmm5, %xmm15 + movdqa %xmm12, %xmm7 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pshufd $0x93, %xmm3, %xmm3 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld 
$9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm11, %xmm6 + pshufd $0x93, %xmm11, %xmm11 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm14 + pshufd $0x93, %xmm15, %xmm15 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm1 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm9 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm9 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm13 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm13 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + pxor %xmm5, %xmm0 + movdqa %xmm3, %xmm4 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm9, %xmm9 + pxor %xmm5, %xmm8 + movdqa %xmm11, %xmm6 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm12 + pshufd $0x39, %xmm13, %xmm13 + + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + pxor %xmm5, %xmm1 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm9 + pxor %xmm5, %xmm9 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm13 + pxor %xmm5, %xmm13 + movdqa %xmm12, %xmm7 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pshufd $0x93, %xmm1, %xmm1 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm9, %xmm6 + pshufd $0x93, %xmm9, %xmm9 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm13, %xmm7 + pshufd $0x93, %xmm13, %xmm13 + pxor %xmm5, %xmm14 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm3 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm11 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm11 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm15 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm15 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm11, %xmm11 + pxor %xmm5, %xmm8 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + pshufd $0x39, %xmm15, %xmm15 + pxor %xmm5, %xmm12 + paddd 64(%rsp), %xmm0 + paddd 80(%rsp), %xmm1 + paddd 96(%rsp), %xmm2 + paddd 112(%rsp), %xmm3 + paddd 128+64(%rsp), %xmm8 + paddd 128+80(%rsp), %xmm9 + paddd 128+96(%rsp), %xmm10 + paddd 128+112(%rsp), %xmm11 + paddd 256+64(%rsp), %xmm12 + paddd 256+80(%rsp), %xmm13 + 
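+	/* Salsa20/8 feed-forward: these paddd's add the block's saved
+	 * input (stashed at 64(%rsp), 128+64(%rsp) and 256+64(%rsp)) back
+	 * into the mixed state, i.e. B[i] += x[i] for each of the three
+	 * lanes. */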
paddd 256+96(%rsp), %xmm14 + paddd 256+112(%rsp), %xmm15 + + addq $3*128, %rbx + cmpq %rax, %rbx + jne scrypt_core_3way_xmm_loop1 + + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + + movq %r8, %rcx + subq $1, %r8 +scrypt_core_3way_xmm_loop2: + movd %xmm0, %ebp movd %xmm8, %ebx movd %xmm12, %eax pxor 0(%rsp), %xmm0 @@ -2156,140 +9413,4237 @@ scrypt_core_3way_xmm_loop2: shll $7, %ebp andl %r8d, %ebx leaq 1(%rbx, %rbx, 2), %rbx - shll $7, %ebx - andl %r8d, %eax + shll $7, %ebx + andl %r8d, %eax + leaq 2(%rax, %rax, 2), %rax + shll $7, %eax + pxor 0(%rsi, %rbp), %xmm0 + pxor 16(%rsi, %rbp), %xmm1 + pxor 32(%rsi, %rbp), %xmm2 + pxor 48(%rsi, %rbp), %xmm3 + pxor 0(%rsi, %rbx), %xmm8 + pxor 16(%rsi, %rbx), %xmm9 + pxor 32(%rsi, %rbx), %xmm10 + pxor 48(%rsi, %rbx), %xmm11 + pxor 0(%rsi, %rax), %xmm12 + pxor 16(%rsi, %rax), %xmm13 + pxor 32(%rsi, %rax), %xmm14 + pxor 48(%rsi, %rax), %xmm15 + + movdqa %xmm0, 0(%rsp) + movdqa %xmm1, 16(%rsp) + movdqa %xmm2, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm8, 128+0(%rsp) + movdqa %xmm9, 128+16(%rsp) + movdqa %xmm10, 128+32(%rsp) + movdqa %xmm11, 128+48(%rsp) + movdqa %xmm12, 256+0(%rsp) + movdqa %xmm13, 256+16(%rsp) + movdqa %xmm14, 256+32(%rsp) + movdqa %xmm15, 256+48(%rsp) + movdqa %xmm1, %xmm4 + movdqa %xmm9, %xmm6 + movdqa %xmm13, %xmm7 + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + pxor %xmm5, %xmm3 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm11 + pxor %xmm5, %xmm11 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm15 + pxor %xmm5, %xmm15 + movdqa %xmm12, %xmm7 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pshufd $0x93, %xmm3, %xmm3 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm11, %xmm6 + pshufd $0x93, %xmm11, %xmm11 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm14 + pshufd $0x93, %xmm15, %xmm15 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm1 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm9 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm9 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm13 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm13 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + pxor %xmm5, %xmm0 + movdqa %xmm3, %xmm4 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm9, %xmm9 + pxor %xmm5, %xmm8 + movdqa %xmm11, %xmm6 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + movdqa %xmm15, %xmm7 + 
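+	/*
+	 * Second scrypt loop (loop2): each iteration the movd/andl/leaq/
+	 * shll sequence above takes the low word of each lane's second
+	 * 64-byte half, masks it with N-1 (kept in %r8d), scales it to a
+	 * (3*j + lane) * 128-byte slot in the interleaved scratchpad, and
+	 * the pxor's from (%rsi,%reg) fold V[j] into the state before the
+	 * next Salsa20/8 pass.  %rcx counts the N iterations.
+	 */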
pxor %xmm5, %xmm12 + pshufd $0x39, %xmm13, %xmm13 + + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + pxor %xmm5, %xmm1 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm9 + pxor %xmm5, %xmm9 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm13 + pxor %xmm5, %xmm13 + movdqa %xmm12, %xmm7 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pshufd $0x93, %xmm1, %xmm1 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm9, %xmm6 + pshufd $0x93, %xmm9, %xmm9 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm13, %xmm7 + pshufd $0x93, %xmm13, %xmm13 + pxor %xmm5, %xmm14 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm3 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm11 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm11 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm15 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm15 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm11, %xmm11 + pxor %xmm5, %xmm8 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + pshufd $0x39, %xmm15, %xmm15 + pxor %xmm5, %xmm12 + movdqa %xmm1, %xmm4 + movdqa %xmm9, %xmm6 + movdqa %xmm13, %xmm7 + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + pxor %xmm5, %xmm3 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm11 + pxor %xmm5, %xmm11 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm15 + pxor %xmm5, %xmm15 + movdqa %xmm12, %xmm7 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pshufd $0x93, %xmm3, %xmm3 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm11, %xmm6 + pshufd $0x93, %xmm11, %xmm11 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm14 + pshufd $0x93, %xmm15, %xmm15 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm1 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm9 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm9 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm13 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm13 + + 
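+	/* The movdqa's into xmm4/xmm6/xmm7 pre-copy the next step's addend
+	 * while the current rotate is still completing, and xmm5 holds the
+	 * shifted-right half of each emulated rotate; the three lanes'
+	 * instruction streams are interleaved to hide latency. */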
paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + pxor %xmm5, %xmm0 + movdqa %xmm3, %xmm4 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm9, %xmm9 + pxor %xmm5, %xmm8 + movdqa %xmm11, %xmm6 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm12 + pshufd $0x39, %xmm13, %xmm13 + + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + pxor %xmm5, %xmm1 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm9 + pxor %xmm5, %xmm9 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm13 + pxor %xmm5, %xmm13 + movdqa %xmm12, %xmm7 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pshufd $0x93, %xmm1, %xmm1 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm9, %xmm6 + pshufd $0x93, %xmm9, %xmm9 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm13, %xmm7 + pshufd $0x93, %xmm13, %xmm13 + pxor %xmm5, %xmm14 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm3 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm11 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm11 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm15 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm15 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm11, %xmm11 + pxor %xmm5, %xmm8 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + pshufd $0x39, %xmm15, %xmm15 + pxor %xmm5, %xmm12 + movdqa %xmm1, %xmm4 + movdqa %xmm9, %xmm6 + movdqa %xmm13, %xmm7 + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + pxor %xmm5, %xmm3 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm11 + pxor %xmm5, %xmm11 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm15 + pxor %xmm5, %xmm15 + movdqa %xmm12, %xmm7 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pshufd $0x93, %xmm3, %xmm3 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm11, %xmm6 + pshufd $0x93, %xmm11, %xmm11 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm14 + pshufd $0x93, %xmm15, %xmm15 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, 
%xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm1 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm9 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm9 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm13 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm13 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + pxor %xmm5, %xmm0 + movdqa %xmm3, %xmm4 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm9, %xmm9 + pxor %xmm5, %xmm8 + movdqa %xmm11, %xmm6 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm12 + pshufd $0x39, %xmm13, %xmm13 + + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + pxor %xmm5, %xmm1 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm9 + pxor %xmm5, %xmm9 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm13 + pxor %xmm5, %xmm13 + movdqa %xmm12, %xmm7 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pshufd $0x93, %xmm1, %xmm1 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm9, %xmm6 + pshufd $0x93, %xmm9, %xmm9 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm13, %xmm7 + pshufd $0x93, %xmm13, %xmm13 + pxor %xmm5, %xmm14 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm3 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm11 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm11 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm15 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm15 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm11, %xmm11 + pxor %xmm5, %xmm8 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + pshufd $0x39, %xmm15, %xmm15 + pxor %xmm5, %xmm12 + movdqa %xmm1, %xmm4 + movdqa %xmm9, %xmm6 + movdqa %xmm13, %xmm7 + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + pxor %xmm5, %xmm3 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm11 + pxor %xmm5, %xmm11 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm15 + pxor %xmm5, %xmm15 + movdqa %xmm12, %xmm7 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld 
$23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pshufd $0x93, %xmm3, %xmm3 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm11, %xmm6 + pshufd $0x93, %xmm11, %xmm11 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm14 + pshufd $0x93, %xmm15, %xmm15 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm1 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm9 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm9 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm13 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm13 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + pxor %xmm5, %xmm0 + movdqa %xmm3, %xmm4 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm9, %xmm9 + pxor %xmm5, %xmm8 + movdqa %xmm11, %xmm6 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm12 + pshufd $0x39, %xmm13, %xmm13 + + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + pxor %xmm5, %xmm1 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm9 + pxor %xmm5, %xmm9 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm13 + pxor %xmm5, %xmm13 + movdqa %xmm12, %xmm7 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pshufd $0x93, %xmm1, %xmm1 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm9, %xmm6 + pshufd $0x93, %xmm9, %xmm9 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm13, %xmm7 + pshufd $0x93, %xmm13, %xmm13 + pxor %xmm5, %xmm14 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm3 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm11 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm11 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm15 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm15 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm11, %xmm11 + pxor %xmm5, %xmm8 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + pshufd $0x39, %xmm15, %xmm15 + pxor %xmm5, %xmm12 + paddd 0(%rsp), %xmm0 + paddd 16(%rsp), %xmm1 + paddd 32(%rsp), %xmm2 + paddd 48(%rsp), %xmm3 + paddd 128+0(%rsp), %xmm8 + paddd 128+16(%rsp), 
%xmm9 + paddd 128+32(%rsp), %xmm10 + paddd 128+48(%rsp), %xmm11 + paddd 256+0(%rsp), %xmm12 + paddd 256+16(%rsp), %xmm13 + paddd 256+32(%rsp), %xmm14 + paddd 256+48(%rsp), %xmm15 + movdqa %xmm0, 0(%rsp) + movdqa %xmm1, 16(%rsp) + movdqa %xmm2, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm8, 128+0(%rsp) + movdqa %xmm9, 128+16(%rsp) + movdqa %xmm10, 128+32(%rsp) + movdqa %xmm11, 128+48(%rsp) + movdqa %xmm12, 256+0(%rsp) + movdqa %xmm13, 256+16(%rsp) + movdqa %xmm14, 256+32(%rsp) + movdqa %xmm15, 256+48(%rsp) + + pxor 64(%rsi, %rbp), %xmm0 + pxor 80(%rsi, %rbp), %xmm1 + pxor 96(%rsi, %rbp), %xmm2 + pxor 112(%rsi, %rbp), %xmm3 + pxor 64(%rsi, %rbx), %xmm8 + pxor 80(%rsi, %rbx), %xmm9 + pxor 96(%rsi, %rbx), %xmm10 + pxor 112(%rsi, %rbx), %xmm11 + pxor 64(%rsi, %rax), %xmm12 + pxor 80(%rsi, %rax), %xmm13 + pxor 96(%rsi, %rax), %xmm14 + pxor 112(%rsi, %rax), %xmm15 + pxor 64(%rsp), %xmm0 + pxor 80(%rsp), %xmm1 + pxor 96(%rsp), %xmm2 + pxor 112(%rsp), %xmm3 + pxor 128+64(%rsp), %xmm8 + pxor 128+80(%rsp), %xmm9 + pxor 128+96(%rsp), %xmm10 + pxor 128+112(%rsp), %xmm11 + pxor 256+64(%rsp), %xmm12 + pxor 256+80(%rsp), %xmm13 + pxor 256+96(%rsp), %xmm14 + pxor 256+112(%rsp), %xmm15 + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + movdqa %xmm1, %xmm4 + movdqa %xmm9, %xmm6 + movdqa %xmm13, %xmm7 + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + pxor %xmm5, %xmm3 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm11 + pxor %xmm5, %xmm11 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm15 + pxor %xmm5, %xmm15 + movdqa %xmm12, %xmm7 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pshufd $0x93, %xmm3, %xmm3 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm11, %xmm6 + pshufd $0x93, %xmm11, %xmm11 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm14 + pshufd $0x93, %xmm15, %xmm15 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm1 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm9 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm9 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm13 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm13 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + pxor %xmm5, %xmm0 + movdqa %xmm3, %xmm4 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm9, %xmm9 + pxor %xmm5, %xmm8 + movdqa %xmm11, %xmm6 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + movdqa %xmm15, 
%xmm7 + pxor %xmm5, %xmm12 + pshufd $0x39, %xmm13, %xmm13 + + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + pxor %xmm5, %xmm1 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm9 + pxor %xmm5, %xmm9 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm13 + pxor %xmm5, %xmm13 + movdqa %xmm12, %xmm7 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pshufd $0x93, %xmm1, %xmm1 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm9, %xmm6 + pshufd $0x93, %xmm9, %xmm9 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm13, %xmm7 + pshufd $0x93, %xmm13, %xmm13 + pxor %xmm5, %xmm14 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm3 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm11 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm11 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm15 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm15 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm11, %xmm11 + pxor %xmm5, %xmm8 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + pshufd $0x39, %xmm15, %xmm15 + pxor %xmm5, %xmm12 + movdqa %xmm1, %xmm4 + movdqa %xmm9, %xmm6 + movdqa %xmm13, %xmm7 + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + pxor %xmm5, %xmm3 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm11 + pxor %xmm5, %xmm11 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm15 + pxor %xmm5, %xmm15 + movdqa %xmm12, %xmm7 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pshufd $0x93, %xmm3, %xmm3 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm11, %xmm6 + pshufd $0x93, %xmm11, %xmm11 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm14 + pshufd $0x93, %xmm15, %xmm15 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm1 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm9 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm9 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm13 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, 
%xmm13 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + pxor %xmm5, %xmm0 + movdqa %xmm3, %xmm4 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm9, %xmm9 + pxor %xmm5, %xmm8 + movdqa %xmm11, %xmm6 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm12 + pshufd $0x39, %xmm13, %xmm13 + + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + pxor %xmm5, %xmm1 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm9 + pxor %xmm5, %xmm9 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm13 + pxor %xmm5, %xmm13 + movdqa %xmm12, %xmm7 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pshufd $0x93, %xmm1, %xmm1 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm9, %xmm6 + pshufd $0x93, %xmm9, %xmm9 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm13, %xmm7 + pshufd $0x93, %xmm13, %xmm13 + pxor %xmm5, %xmm14 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm3 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm11 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm11 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm15 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm15 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm11, %xmm11 + pxor %xmm5, %xmm8 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + pshufd $0x39, %xmm15, %xmm15 + pxor %xmm5, %xmm12 + movdqa %xmm1, %xmm4 + movdqa %xmm9, %xmm6 + movdqa %xmm13, %xmm7 + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + pxor %xmm5, %xmm3 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm11 + pxor %xmm5, %xmm11 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm15 + pxor %xmm5, %xmm15 + movdqa %xmm12, %xmm7 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pshufd $0x93, %xmm3, %xmm3 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm11, %xmm6 + pshufd $0x93, %xmm11, %xmm11 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm14 + pshufd $0x93, %xmm15, %xmm15 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + 
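+	/* Each paddd/pslld/psrld/pxor group is one quarter-round step
+	 * applied to all three lanes; four steps with rotate counts 7, 9,
+	 * 13 and 18 make one Salsa20 round, and column and row rounds
+	 * alternate through the unrolled double-rounds. */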
paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm1 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm9 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm9 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm13 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm13 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + pxor %xmm5, %xmm0 + movdqa %xmm3, %xmm4 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm9, %xmm9 + pxor %xmm5, %xmm8 + movdqa %xmm11, %xmm6 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm12 + pshufd $0x39, %xmm13, %xmm13 + + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + pxor %xmm5, %xmm1 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm9 + pxor %xmm5, %xmm9 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm13 + pxor %xmm5, %xmm13 + movdqa %xmm12, %xmm7 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pshufd $0x93, %xmm1, %xmm1 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm9, %xmm6 + pshufd $0x93, %xmm9, %xmm9 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm13, %xmm7 + pshufd $0x93, %xmm13, %xmm13 + pxor %xmm5, %xmm14 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm3 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm11 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm11 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm15 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm15 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm11, %xmm11 + pxor %xmm5, %xmm8 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + pshufd $0x39, %xmm15, %xmm15 + pxor %xmm5, %xmm12 + movdqa %xmm1, %xmm4 + movdqa %xmm9, %xmm6 + movdqa %xmm13, %xmm7 + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + pxor %xmm5, %xmm3 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm11 + pxor %xmm5, %xmm11 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm15 + pxor %xmm5, %xmm15 + movdqa %xmm12, %xmm7 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, 
%xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pshufd $0x93, %xmm3, %xmm3 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm11, %xmm6 + pshufd $0x93, %xmm11, %xmm11 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm14 + pshufd $0x93, %xmm15, %xmm15 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm1 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm9 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm9 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm13 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm13 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + pxor %xmm5, %xmm0 + movdqa %xmm3, %xmm4 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm9, %xmm9 + pxor %xmm5, %xmm8 + movdqa %xmm11, %xmm6 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm12 + pshufd $0x39, %xmm13, %xmm13 + + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + pxor %xmm5, %xmm1 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm9 + pxor %xmm5, %xmm9 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm13 + pxor %xmm5, %xmm13 + movdqa %xmm12, %xmm7 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pshufd $0x93, %xmm1, %xmm1 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm9, %xmm6 + pshufd $0x93, %xmm9, %xmm9 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm13, %xmm7 + pshufd $0x93, %xmm13, %xmm13 + pxor %xmm5, %xmm14 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm3 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm11 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm11 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm15 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm15 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm11, %xmm11 + pxor %xmm5, %xmm8 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + pshufd $0x39, %xmm15, %xmm15 + pxor %xmm5, %xmm12 + paddd 64(%rsp), %xmm0 + paddd 80(%rsp), %xmm1 + paddd 96(%rsp), %xmm2 + paddd 112(%rsp), %xmm3 + paddd 128+64(%rsp), %xmm8 + 
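+	/* Feed-forward for the second 64-byte half of each lane; the
+	 * results are stored back to 64(%rsp) and up, %rcx is decremented,
+	 * and loop2 repeats.  After N iterations the movl block below
+	 * permutes the state words back into the caller's layout and
+	 * writes the three lanes' final state out through %rdi. */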
paddd 128+80(%rsp), %xmm9 + paddd 128+96(%rsp), %xmm10 + paddd 128+112(%rsp), %xmm11 + paddd 256+64(%rsp), %xmm12 + paddd 256+80(%rsp), %xmm13 + paddd 256+96(%rsp), %xmm14 + paddd 256+112(%rsp), %xmm15 + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + + subq $1, %rcx + ja scrypt_core_3way_xmm_loop2 + + movl 0+60(%rsp), %eax + movl 0+44(%rsp), %ebx + movl 0+28(%rsp), %ecx + movl 0+12(%rsp), %edx + movl %eax, 0+12(%rdi) + movl %ebx, 0+28(%rdi) + movl %ecx, 0+44(%rdi) + movl %edx, 0+60(%rdi) + movl 0+40(%rsp), %eax + movl 0+8(%rsp), %ebx + movl 0+48(%rsp), %ecx + movl 0+16(%rsp), %edx + movl %eax, 0+8(%rdi) + movl %ebx, 0+40(%rdi) + movl %ecx, 0+16(%rdi) + movl %edx, 0+48(%rdi) + movl 0+20(%rsp), %eax + movl 0+4(%rsp), %ebx + movl 0+52(%rsp), %ecx + movl 0+36(%rsp), %edx + movl %eax, 0+4(%rdi) + movl %ebx, 0+20(%rdi) + movl %ecx, 0+36(%rdi) + movl %edx, 0+52(%rdi) + movl 0+0(%rsp), %eax + movl 0+24(%rsp), %ebx + movl 0+32(%rsp), %ecx + movl 0+56(%rsp), %edx + movl %eax, 0+0(%rdi) + movl %ebx, 0+24(%rdi) + movl %ecx, 0+32(%rdi) + movl %edx, 0+56(%rdi) + movl 64+60(%rsp), %eax + movl 64+44(%rsp), %ebx + movl 64+28(%rsp), %ecx + movl 64+12(%rsp), %edx + movl %eax, 64+12(%rdi) + movl %ebx, 64+28(%rdi) + movl %ecx, 64+44(%rdi) + movl %edx, 64+60(%rdi) + movl 64+40(%rsp), %eax + movl 64+8(%rsp), %ebx + movl 64+48(%rsp), %ecx + movl 64+16(%rsp), %edx + movl %eax, 64+8(%rdi) + movl %ebx, 64+40(%rdi) + movl %ecx, 64+16(%rdi) + movl %edx, 64+48(%rdi) + movl 64+20(%rsp), %eax + movl 64+4(%rsp), %ebx + movl 64+52(%rsp), %ecx + movl 64+36(%rsp), %edx + movl %eax, 64+4(%rdi) + movl %ebx, 64+20(%rdi) + movl %ecx, 64+36(%rdi) + movl %edx, 64+52(%rdi) + movl 64+0(%rsp), %eax + movl 64+24(%rsp), %ebx + movl 64+32(%rsp), %ecx + movl 64+56(%rsp), %edx + movl %eax, 64+0(%rdi) + movl %ebx, 64+24(%rdi) + movl %ecx, 64+32(%rdi) + movl %edx, 64+56(%rdi) + movl 128+60(%rsp), %eax + movl 128+44(%rsp), %ebx + movl 128+28(%rsp), %ecx + movl 128+12(%rsp), %edx + movl %eax, 128+12(%rdi) + movl %ebx, 128+28(%rdi) + movl %ecx, 128+44(%rdi) + movl %edx, 128+60(%rdi) + movl 128+40(%rsp), %eax + movl 128+8(%rsp), %ebx + movl 128+48(%rsp), %ecx + movl 128+16(%rsp), %edx + movl %eax, 128+8(%rdi) + movl %ebx, 128+40(%rdi) + movl %ecx, 128+16(%rdi) + movl %edx, 128+48(%rdi) + movl 128+20(%rsp), %eax + movl 128+4(%rsp), %ebx + movl 128+52(%rsp), %ecx + movl 128+36(%rsp), %edx + movl %eax, 128+4(%rdi) + movl %ebx, 128+20(%rdi) + movl %ecx, 128+36(%rdi) + movl %edx, 128+52(%rdi) + movl 128+0(%rsp), %eax + movl 128+24(%rsp), %ebx + movl 128+32(%rsp), %ecx + movl 128+56(%rsp), %edx + movl %eax, 128+0(%rdi) + movl %ebx, 128+24(%rdi) + movl %ecx, 128+32(%rdi) + movl %edx, 128+56(%rdi) + movl 192+60(%rsp), %eax + movl 192+44(%rsp), %ebx + movl 192+28(%rsp), %ecx + movl 192+12(%rsp), %edx + movl %eax, 192+12(%rdi) + movl %ebx, 192+28(%rdi) + movl %ecx, 192+44(%rdi) + movl %edx, 192+60(%rdi) + movl 192+40(%rsp), %eax + movl 192+8(%rsp), %ebx + movl 192+48(%rsp), %ecx + movl 192+16(%rsp), %edx + movl %eax, 192+8(%rdi) + movl %ebx, 192+40(%rdi) + movl %ecx, 192+16(%rdi) + movl %edx, 192+48(%rdi) + movl 192+20(%rsp), %eax + movl 192+4(%rsp), %ebx + movl 192+52(%rsp), %ecx + movl 192+36(%rsp), %edx + movl %eax, 192+4(%rdi) + movl %ebx, 192+20(%rdi) + movl 
%ecx, 192+36(%rdi) + movl %edx, 192+52(%rdi) + movl 192+0(%rsp), %eax + movl 192+24(%rsp), %ebx + movl 192+32(%rsp), %ecx + movl 192+56(%rsp), %edx + movl %eax, 192+0(%rdi) + movl %ebx, 192+24(%rdi) + movl %ecx, 192+32(%rdi) + movl %edx, 192+56(%rdi) + movl 256+60(%rsp), %eax + movl 256+44(%rsp), %ebx + movl 256+28(%rsp), %ecx + movl 256+12(%rsp), %edx + movl %eax, 256+12(%rdi) + movl %ebx, 256+28(%rdi) + movl %ecx, 256+44(%rdi) + movl %edx, 256+60(%rdi) + movl 256+40(%rsp), %eax + movl 256+8(%rsp), %ebx + movl 256+48(%rsp), %ecx + movl 256+16(%rsp), %edx + movl %eax, 256+8(%rdi) + movl %ebx, 256+40(%rdi) + movl %ecx, 256+16(%rdi) + movl %edx, 256+48(%rdi) + movl 256+20(%rsp), %eax + movl 256+4(%rsp), %ebx + movl 256+52(%rsp), %ecx + movl 256+36(%rsp), %edx + movl %eax, 256+4(%rdi) + movl %ebx, 256+20(%rdi) + movl %ecx, 256+36(%rdi) + movl %edx, 256+52(%rdi) + movl 256+0(%rsp), %eax + movl 256+24(%rsp), %ebx + movl 256+32(%rsp), %ecx + movl 256+56(%rsp), %edx + movl %eax, 256+0(%rdi) + movl %ebx, 256+24(%rdi) + movl %ecx, 256+32(%rdi) + movl %edx, 256+56(%rdi) + movl 320+60(%rsp), %eax + movl 320+44(%rsp), %ebx + movl 320+28(%rsp), %ecx + movl 320+12(%rsp), %edx + movl %eax, 320+12(%rdi) + movl %ebx, 320+28(%rdi) + movl %ecx, 320+44(%rdi) + movl %edx, 320+60(%rdi) + movl 320+40(%rsp), %eax + movl 320+8(%rsp), %ebx + movl 320+48(%rsp), %ecx + movl 320+16(%rsp), %edx + movl %eax, 320+8(%rdi) + movl %ebx, 320+40(%rdi) + movl %ecx, 320+16(%rdi) + movl %edx, 320+48(%rdi) + movl 320+20(%rsp), %eax + movl 320+4(%rsp), %ebx + movl 320+52(%rsp), %ecx + movl 320+36(%rsp), %edx + movl %eax, 320+4(%rdi) + movl %ebx, 320+20(%rdi) + movl %ecx, 320+36(%rdi) + movl %edx, 320+52(%rdi) + movl 320+0(%rsp), %eax + movl 320+24(%rsp), %ebx + movl 320+32(%rsp), %ecx + movl 320+56(%rsp), %edx + movl %eax, 320+0(%rdi) + movl %ebx, 320+24(%rdi) + movl %ecx, 320+32(%rdi) + movl %edx, 320+56(%rdi) + + addq $392, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + popq %rdi + movdqa 8(%rsp), %xmm6 + movdqa 24(%rsp), %xmm7 + movdqa 40(%rsp), %xmm8 + movdqa 56(%rsp), %xmm9 + movdqa 72(%rsp), %xmm10 + movdqa 88(%rsp), %xmm11 + movdqa 104(%rsp), %xmm12 + movdqa 120(%rsp), %xmm13 + movdqa 136(%rsp), %xmm14 + movdqa 152(%rsp), %xmm15 + addq $176, %rsp +#endif + popq %rbp + popq %rbx + ret + + +#if defined(USE_AVX2) + + + + .text + .p2align 6 + .globl scrypt_core_6way + .globl _scrypt_core_6way +scrypt_core_6way: +_scrypt_core_6way: + pushq %rbx + pushq %rbp +#if defined(_WIN64) || defined(__CYGWIN__) + subq $176, %rsp + vmovdqa %xmm6, 8(%rsp) + vmovdqa %xmm7, 24(%rsp) + vmovdqa %xmm8, 40(%rsp) + vmovdqa %xmm9, 56(%rsp) + vmovdqa %xmm10, 72(%rsp) + vmovdqa %xmm11, 88(%rsp) + vmovdqa %xmm12, 104(%rsp) + vmovdqa %xmm13, 120(%rsp) + vmovdqa %xmm14, 136(%rsp) + vmovdqa %xmm15, 152(%rsp) + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#else + movq %rdx, %r8 +#endif + movq %rsp, %rdx + subq $768, %rsp + andq $-128, %rsp + + + + +scrypt_core_6way_avx2: + vmovdqa 0*256+0+0*16(%rdi), %xmm0 + vmovdqa 0*256+0+1*16(%rdi), %xmm1 + vmovdqa 0*256+0+2*16(%rdi), %xmm2 + vmovdqa 0*256+0+3*16(%rdi), %xmm3 + vinserti128 $1, 0*256+0+128+0*16(%rdi), %ymm0, %ymm0 + vinserti128 $1, 0*256+0+128+1*16(%rdi), %ymm1, %ymm1 + vinserti128 $1, 0*256+0+128+2*16(%rdi), %ymm2, %ymm2 + vinserti128 $1, 0*256+0+128+3*16(%rdi), %ymm3, %ymm3 + vpblendd $0x33, %ymm0, %ymm2, %ymm4 + vpblendd $0xcc, %ymm1, %ymm3, %ymm5 + vpblendd $0x33, %ymm2, %ymm0, %ymm6 + vpblendd $0xcc, %ymm3, %ymm1, %ymm7 + vpblendd $0x55, %ymm7, %ymm6, %ymm3 + vpblendd 
$0x55, %ymm6, %ymm5, %ymm2 + vpblendd $0x55, %ymm5, %ymm4, %ymm1 + vpblendd $0x55, %ymm4, %ymm7, %ymm0 + vmovdqa %ymm0, 0*128+0*32(%rsp) + vmovdqa %ymm1, 0*128+1*32(%rsp) + vmovdqa %ymm2, 0*128+2*32(%rsp) + vmovdqa %ymm3, 0*128+3*32(%rsp) + vmovdqa 0*256+64+0*16(%rdi), %xmm0 + vmovdqa 0*256+64+1*16(%rdi), %xmm1 + vmovdqa 0*256+64+2*16(%rdi), %xmm2 + vmovdqa 0*256+64+3*16(%rdi), %xmm3 + vinserti128 $1, 0*256+64+128+0*16(%rdi), %ymm0, %ymm0 + vinserti128 $1, 0*256+64+128+1*16(%rdi), %ymm1, %ymm1 + vinserti128 $1, 0*256+64+128+2*16(%rdi), %ymm2, %ymm2 + vinserti128 $1, 0*256+64+128+3*16(%rdi), %ymm3, %ymm3 + vpblendd $0x33, %ymm0, %ymm2, %ymm4 + vpblendd $0xcc, %ymm1, %ymm3, %ymm5 + vpblendd $0x33, %ymm2, %ymm0, %ymm6 + vpblendd $0xcc, %ymm3, %ymm1, %ymm7 + vpblendd $0x55, %ymm7, %ymm6, %ymm3 + vpblendd $0x55, %ymm6, %ymm5, %ymm2 + vpblendd $0x55, %ymm5, %ymm4, %ymm1 + vpblendd $0x55, %ymm4, %ymm7, %ymm0 + vmovdqa %ymm0, 1*128+0*32(%rsp) + vmovdqa %ymm1, 1*128+1*32(%rsp) + vmovdqa %ymm2, 1*128+2*32(%rsp) + vmovdqa %ymm3, 1*128+3*32(%rsp) + vmovdqa 1*256+0+0*16(%rdi), %xmm0 + vmovdqa 1*256+0+1*16(%rdi), %xmm1 + vmovdqa 1*256+0+2*16(%rdi), %xmm2 + vmovdqa 1*256+0+3*16(%rdi), %xmm3 + vinserti128 $1, 1*256+0+128+0*16(%rdi), %ymm0, %ymm0 + vinserti128 $1, 1*256+0+128+1*16(%rdi), %ymm1, %ymm1 + vinserti128 $1, 1*256+0+128+2*16(%rdi), %ymm2, %ymm2 + vinserti128 $1, 1*256+0+128+3*16(%rdi), %ymm3, %ymm3 + vpblendd $0x33, %ymm0, %ymm2, %ymm4 + vpblendd $0xcc, %ymm1, %ymm3, %ymm5 + vpblendd $0x33, %ymm2, %ymm0, %ymm6 + vpblendd $0xcc, %ymm3, %ymm1, %ymm7 + vpblendd $0x55, %ymm7, %ymm6, %ymm3 + vpblendd $0x55, %ymm6, %ymm5, %ymm2 + vpblendd $0x55, %ymm5, %ymm4, %ymm1 + vpblendd $0x55, %ymm4, %ymm7, %ymm0 + vmovdqa %ymm0, 2*128+0*32(%rsp) + vmovdqa %ymm1, 2*128+1*32(%rsp) + vmovdqa %ymm2, 2*128+2*32(%rsp) + vmovdqa %ymm3, 2*128+3*32(%rsp) + vmovdqa 1*256+64+0*16(%rdi), %xmm0 + vmovdqa 1*256+64+1*16(%rdi), %xmm1 + vmovdqa 1*256+64+2*16(%rdi), %xmm2 + vmovdqa 1*256+64+3*16(%rdi), %xmm3 + vinserti128 $1, 1*256+64+128+0*16(%rdi), %ymm0, %ymm0 + vinserti128 $1, 1*256+64+128+1*16(%rdi), %ymm1, %ymm1 + vinserti128 $1, 1*256+64+128+2*16(%rdi), %ymm2, %ymm2 + vinserti128 $1, 1*256+64+128+3*16(%rdi), %ymm3, %ymm3 + vpblendd $0x33, %ymm0, %ymm2, %ymm4 + vpblendd $0xcc, %ymm1, %ymm3, %ymm5 + vpblendd $0x33, %ymm2, %ymm0, %ymm6 + vpblendd $0xcc, %ymm3, %ymm1, %ymm7 + vpblendd $0x55, %ymm7, %ymm6, %ymm3 + vpblendd $0x55, %ymm6, %ymm5, %ymm2 + vpblendd $0x55, %ymm5, %ymm4, %ymm1 + vpblendd $0x55, %ymm4, %ymm7, %ymm0 + vmovdqa %ymm0, 3*128+0*32(%rsp) + vmovdqa %ymm1, 3*128+1*32(%rsp) + vmovdqa %ymm2, 3*128+2*32(%rsp) + vmovdqa %ymm3, 3*128+3*32(%rsp) + vmovdqa 2*256+0+0*16(%rdi), %xmm0 + vmovdqa 2*256+0+1*16(%rdi), %xmm1 + vmovdqa 2*256+0+2*16(%rdi), %xmm2 + vmovdqa 2*256+0+3*16(%rdi), %xmm3 + vinserti128 $1, 2*256+0+128+0*16(%rdi), %ymm0, %ymm0 + vinserti128 $1, 2*256+0+128+1*16(%rdi), %ymm1, %ymm1 + vinserti128 $1, 2*256+0+128+2*16(%rdi), %ymm2, %ymm2 + vinserti128 $1, 2*256+0+128+3*16(%rdi), %ymm3, %ymm3 + vpblendd $0x33, %ymm0, %ymm2, %ymm4 + vpblendd $0xcc, %ymm1, %ymm3, %ymm5 + vpblendd $0x33, %ymm2, %ymm0, %ymm6 + vpblendd $0xcc, %ymm3, %ymm1, %ymm7 + vpblendd $0x55, %ymm7, %ymm6, %ymm3 + vpblendd $0x55, %ymm6, %ymm5, %ymm2 + vpblendd $0x55, %ymm5, %ymm4, %ymm1 + vpblendd $0x55, %ymm4, %ymm7, %ymm0 + vmovdqa %ymm0, 4*128+0*32(%rsp) + vmovdqa %ymm1, 4*128+1*32(%rsp) + vmovdqa %ymm2, 4*128+2*32(%rsp) + vmovdqa %ymm3, 4*128+3*32(%rsp) + vmovdqa 2*256+64+0*16(%rdi), %xmm0 + vmovdqa 2*256+64+1*16(%rdi), %xmm1 + vmovdqa 
2*256+64+2*16(%rdi), %xmm2 + vmovdqa 2*256+64+3*16(%rdi), %xmm3 + vinserti128 $1, 2*256+64+128+0*16(%rdi), %ymm0, %ymm0 + vinserti128 $1, 2*256+64+128+1*16(%rdi), %ymm1, %ymm1 + vinserti128 $1, 2*256+64+128+2*16(%rdi), %ymm2, %ymm2 + vinserti128 $1, 2*256+64+128+3*16(%rdi), %ymm3, %ymm3 + vpblendd $0x33, %ymm0, %ymm2, %ymm4 + vpblendd $0xcc, %ymm1, %ymm3, %ymm5 + vpblendd $0x33, %ymm2, %ymm0, %ymm6 + vpblendd $0xcc, %ymm3, %ymm1, %ymm7 + vpblendd $0x55, %ymm7, %ymm6, %ymm3 + vpblendd $0x55, %ymm6, %ymm5, %ymm2 + vpblendd $0x55, %ymm5, %ymm4, %ymm1 + vpblendd $0x55, %ymm4, %ymm7, %ymm0 + vmovdqa %ymm0, 5*128+0*32(%rsp) + vmovdqa %ymm1, 5*128+1*32(%rsp) + vmovdqa %ymm2, 5*128+2*32(%rsp) + vmovdqa %ymm3, 5*128+3*32(%rsp) + + vmovdqa 0*256+4*32(%rsp), %ymm0 + vmovdqa 0*256+5*32(%rsp), %ymm1 + vmovdqa 0*256+6*32(%rsp), %ymm2 + vmovdqa 0*256+7*32(%rsp), %ymm3 + vmovdqa 1*256+4*32(%rsp), %ymm8 + vmovdqa 1*256+5*32(%rsp), %ymm9 + vmovdqa 1*256+6*32(%rsp), %ymm10 + vmovdqa 1*256+7*32(%rsp), %ymm11 + vmovdqa 2*256+4*32(%rsp), %ymm12 + vmovdqa 2*256+5*32(%rsp), %ymm13 + vmovdqa 2*256+6*32(%rsp), %ymm14 + vmovdqa 2*256+7*32(%rsp), %ymm15 + + movq %rsi, %rbx + leaq (%r8, %r8, 2), %rax + shlq $8, %rax + addq %rsi, %rax +scrypt_core_6way_avx2_loop1: + vmovdqa %ymm0, 0*256+4*32(%rbx) + vmovdqa %ymm1, 0*256+5*32(%rbx) + vmovdqa %ymm2, 0*256+6*32(%rbx) + vmovdqa %ymm3, 0*256+7*32(%rbx) + vpxor 0*256+0*32(%rsp), %ymm0, %ymm0 + vpxor 0*256+1*32(%rsp), %ymm1, %ymm1 + vpxor 0*256+2*32(%rsp), %ymm2, %ymm2 + vpxor 0*256+3*32(%rsp), %ymm3, %ymm3 + vmovdqa %ymm8, 1*256+4*32(%rbx) + vmovdqa %ymm9, 1*256+5*32(%rbx) + vmovdqa %ymm10, 1*256+6*32(%rbx) + vmovdqa %ymm11, 1*256+7*32(%rbx) + vpxor 1*256+0*32(%rsp), %ymm8, %ymm8 + vpxor 1*256+1*32(%rsp), %ymm9, %ymm9 + vpxor 1*256+2*32(%rsp), %ymm10, %ymm10 + vpxor 1*256+3*32(%rsp), %ymm11, %ymm11 + vmovdqa %ymm12, 2*256+4*32(%rbx) + vmovdqa %ymm13, 2*256+5*32(%rbx) + vmovdqa %ymm14, 2*256+6*32(%rbx) + vmovdqa %ymm15, 2*256+7*32(%rbx) + vpxor 2*256+0*32(%rsp), %ymm12, %ymm12 + vpxor 2*256+1*32(%rsp), %ymm13, %ymm13 + vpxor 2*256+2*32(%rsp), %ymm14, %ymm14 + vpxor 2*256+3*32(%rsp), %ymm15, %ymm15 + vmovdqa %ymm0, 0*256+0*32(%rbx) + vmovdqa %ymm1, 0*256+1*32(%rbx) + vmovdqa %ymm2, 0*256+2*32(%rbx) + vmovdqa %ymm3, 0*256+3*32(%rbx) + vmovdqa %ymm8, 1*256+0*32(%rbx) + vmovdqa %ymm9, 1*256+1*32(%rbx) + vmovdqa %ymm10, 1*256+2*32(%rbx) + vmovdqa %ymm11, 1*256+3*32(%rbx) + vmovdqa %ymm12, 2*256+0*32(%rbx) + vmovdqa %ymm13, 2*256+1*32(%rbx) + vmovdqa %ymm14, 2*256+2*32(%rbx) + vmovdqa %ymm15, 2*256+3*32(%rbx) + + vpaddd %ymm0, %ymm1, %ymm4 + vpaddd %ymm8, %ymm9, %ymm6 + vpaddd %ymm12, %ymm13, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm0, %ymm4 + vpaddd %ymm11, %ymm8, %ymm6 + vpaddd %ymm15, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm3, %ymm4 + vpaddd %ymm10, %ymm11, %ymm6 + vpaddd %ymm14, %ymm15, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, 
%ymm4, %ymm4 + vpshufd $0x93, %ymm3, %ymm3 + vpshufd $0x93, %ymm11, %ymm11 + vpshufd $0x93, %ymm15, %ymm15 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm2, %ymm4 + vpaddd %ymm9, %ymm10, %ymm6 + vpaddd %ymm13, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpshufd $0x4e, %ymm14, %ymm14 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + + vpaddd %ymm0, %ymm3, %ymm4 + vpaddd %ymm8, %ymm11, %ymm6 + vpaddd %ymm12, %ymm15, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpshufd $0x39, %ymm1, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpshufd $0x39, %ymm9, %ymm9 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpshufd $0x39, %ymm13, %ymm13 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm0, %ymm4 + vpaddd %ymm9, %ymm8, %ymm6 + vpaddd %ymm13, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm1, %ymm4 + vpaddd %ymm10, %ymm9, %ymm6 + vpaddd %ymm14, %ymm13, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm1, %ymm1 + vpshufd $0x93, %ymm9, %ymm9 + vpshufd $0x93, %ymm13, %ymm13 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm2, %ymm4 + vpaddd %ymm11, %ymm10, %ymm6 + vpaddd %ymm15, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpshufd $0x4e, %ymm14, %ymm14 + vpshufd $0x39, %ymm11, %ymm11 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpshufd $0x39, %ymm3, %ymm3 + vpshufd $0x39, %ymm15, %ymm15 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + vpaddd %ymm0, %ymm1, %ymm4 + vpaddd %ymm8, %ymm9, %ymm6 + vpaddd %ymm12, %ymm13, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm0, %ymm4 + vpaddd %ymm11, %ymm8, %ymm6 + vpaddd %ymm15, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 
+ vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm3, %ymm4 + vpaddd %ymm10, %ymm11, %ymm6 + vpaddd %ymm14, %ymm15, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm3, %ymm3 + vpshufd $0x93, %ymm11, %ymm11 + vpshufd $0x93, %ymm15, %ymm15 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm2, %ymm4 + vpaddd %ymm9, %ymm10, %ymm6 + vpaddd %ymm13, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpshufd $0x4e, %ymm14, %ymm14 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + + vpaddd %ymm0, %ymm3, %ymm4 + vpaddd %ymm8, %ymm11, %ymm6 + vpaddd %ymm12, %ymm15, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpshufd $0x39, %ymm1, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpshufd $0x39, %ymm9, %ymm9 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpshufd $0x39, %ymm13, %ymm13 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm0, %ymm4 + vpaddd %ymm9, %ymm8, %ymm6 + vpaddd %ymm13, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm1, %ymm4 + vpaddd %ymm10, %ymm9, %ymm6 + vpaddd %ymm14, %ymm13, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm1, %ymm1 + vpshufd $0x93, %ymm9, %ymm9 + vpshufd $0x93, %ymm13, %ymm13 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm2, %ymm4 + vpaddd %ymm11, %ymm10, %ymm6 + vpaddd %ymm15, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpshufd $0x4e, %ymm14, %ymm14 + vpshufd $0x39, %ymm11, %ymm11 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpshufd $0x39, %ymm3, %ymm3 + vpshufd $0x39, %ymm15, %ymm15 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + vpaddd %ymm0, %ymm1, %ymm4 + vpaddd %ymm8, %ymm9, %ymm6 + vpaddd %ymm12, %ymm13, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpxor 
%ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm0, %ymm4 + vpaddd %ymm11, %ymm8, %ymm6 + vpaddd %ymm15, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm3, %ymm4 + vpaddd %ymm10, %ymm11, %ymm6 + vpaddd %ymm14, %ymm15, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm3, %ymm3 + vpshufd $0x93, %ymm11, %ymm11 + vpshufd $0x93, %ymm15, %ymm15 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm2, %ymm4 + vpaddd %ymm9, %ymm10, %ymm6 + vpaddd %ymm13, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpshufd $0x4e, %ymm14, %ymm14 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + + vpaddd %ymm0, %ymm3, %ymm4 + vpaddd %ymm8, %ymm11, %ymm6 + vpaddd %ymm12, %ymm15, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpshufd $0x39, %ymm1, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpshufd $0x39, %ymm9, %ymm9 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpshufd $0x39, %ymm13, %ymm13 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm0, %ymm4 + vpaddd %ymm9, %ymm8, %ymm6 + vpaddd %ymm13, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm1, %ymm4 + vpaddd %ymm10, %ymm9, %ymm6 + vpaddd %ymm14, %ymm13, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm1, %ymm1 + vpshufd $0x93, %ymm9, %ymm9 + vpshufd $0x93, %ymm13, %ymm13 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm2, %ymm4 + vpaddd %ymm11, %ymm10, %ymm6 + vpaddd %ymm15, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpshufd $0x4e, %ymm14, 
%ymm14 + vpshufd $0x39, %ymm11, %ymm11 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpshufd $0x39, %ymm3, %ymm3 + vpshufd $0x39, %ymm15, %ymm15 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + vpaddd %ymm0, %ymm1, %ymm4 + vpaddd %ymm8, %ymm9, %ymm6 + vpaddd %ymm12, %ymm13, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm0, %ymm4 + vpaddd %ymm11, %ymm8, %ymm6 + vpaddd %ymm15, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm3, %ymm4 + vpaddd %ymm10, %ymm11, %ymm6 + vpaddd %ymm14, %ymm15, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm3, %ymm3 + vpshufd $0x93, %ymm11, %ymm11 + vpshufd $0x93, %ymm15, %ymm15 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm2, %ymm4 + vpaddd %ymm9, %ymm10, %ymm6 + vpaddd %ymm13, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpshufd $0x4e, %ymm14, %ymm14 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + + vpaddd %ymm0, %ymm3, %ymm4 + vpaddd %ymm8, %ymm11, %ymm6 + vpaddd %ymm12, %ymm15, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpshufd $0x39, %ymm1, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpshufd $0x39, %ymm9, %ymm9 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpshufd $0x39, %ymm13, %ymm13 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm0, %ymm4 + vpaddd %ymm9, %ymm8, %ymm6 + vpaddd %ymm13, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm1, %ymm4 + vpaddd %ymm10, %ymm9, %ymm6 + vpaddd %ymm14, %ymm13, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm1, %ymm1 + vpshufd $0x93, %ymm9, %ymm9 + vpshufd $0x93, %ymm13, %ymm13 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, 
%ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm2, %ymm4 + vpaddd %ymm11, %ymm10, %ymm6 + vpaddd %ymm15, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpshufd $0x4e, %ymm14, %ymm14 + vpshufd $0x39, %ymm11, %ymm11 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpshufd $0x39, %ymm3, %ymm3 + vpshufd $0x39, %ymm15, %ymm15 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + vpaddd 0*256+0*32(%rbx), %ymm0, %ymm0 + vpaddd 0*256+1*32(%rbx), %ymm1, %ymm1 + vpaddd 0*256+2*32(%rbx), %ymm2, %ymm2 + vpaddd 0*256+3*32(%rbx), %ymm3, %ymm3 + vpaddd 1*256+0*32(%rbx), %ymm8, %ymm8 + vpaddd 1*256+1*32(%rbx), %ymm9, %ymm9 + vpaddd 1*256+2*32(%rbx), %ymm10, %ymm10 + vpaddd 1*256+3*32(%rbx), %ymm11, %ymm11 + vpaddd 2*256+0*32(%rbx), %ymm12, %ymm12 + vpaddd 2*256+1*32(%rbx), %ymm13, %ymm13 + vpaddd 2*256+2*32(%rbx), %ymm14, %ymm14 + vpaddd 2*256+3*32(%rbx), %ymm15, %ymm15 + vmovdqa %ymm0, 0*256+0*32(%rsp) + vmovdqa %ymm1, 0*256+1*32(%rsp) + vmovdqa %ymm2, 0*256+2*32(%rsp) + vmovdqa %ymm3, 0*256+3*32(%rsp) + vmovdqa %ymm8, 1*256+0*32(%rsp) + vmovdqa %ymm9, 1*256+1*32(%rsp) + vmovdqa %ymm10, 1*256+2*32(%rsp) + vmovdqa %ymm11, 1*256+3*32(%rsp) + vmovdqa %ymm12, 2*256+0*32(%rsp) + vmovdqa %ymm13, 2*256+1*32(%rsp) + vmovdqa %ymm14, 2*256+2*32(%rsp) + vmovdqa %ymm15, 2*256+3*32(%rsp) + + vpxor 0*256+4*32(%rbx), %ymm0, %ymm0 + vpxor 0*256+5*32(%rbx), %ymm1, %ymm1 + vpxor 0*256+6*32(%rbx), %ymm2, %ymm2 + vpxor 0*256+7*32(%rbx), %ymm3, %ymm3 + vpxor 1*256+4*32(%rbx), %ymm8, %ymm8 + vpxor 1*256+5*32(%rbx), %ymm9, %ymm9 + vpxor 1*256+6*32(%rbx), %ymm10, %ymm10 + vpxor 1*256+7*32(%rbx), %ymm11, %ymm11 + vpxor 2*256+4*32(%rbx), %ymm12, %ymm12 + vpxor 2*256+5*32(%rbx), %ymm13, %ymm13 + vpxor 2*256+6*32(%rbx), %ymm14, %ymm14 + vpxor 2*256+7*32(%rbx), %ymm15, %ymm15 + vmovdqa %ymm0, 0*256+4*32(%rsp) + vmovdqa %ymm1, 0*256+5*32(%rsp) + vmovdqa %ymm2, 0*256+6*32(%rsp) + vmovdqa %ymm3, 0*256+7*32(%rsp) + vmovdqa %ymm8, 1*256+4*32(%rsp) + vmovdqa %ymm9, 1*256+5*32(%rsp) + vmovdqa %ymm10, 1*256+6*32(%rsp) + vmovdqa %ymm11, 1*256+7*32(%rsp) + vmovdqa %ymm12, 2*256+4*32(%rsp) + vmovdqa %ymm13, 2*256+5*32(%rsp) + vmovdqa %ymm14, 2*256+6*32(%rsp) + vmovdqa %ymm15, 2*256+7*32(%rsp) + vpaddd %ymm0, %ymm1, %ymm4 + vpaddd %ymm8, %ymm9, %ymm6 + vpaddd %ymm12, %ymm13, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm0, %ymm4 + vpaddd %ymm11, %ymm8, %ymm6 + vpaddd %ymm15, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm3, %ymm4 + vpaddd %ymm10, %ymm11, %ymm6 + vpaddd %ymm14, %ymm15, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm3, %ymm3 + vpshufd $0x93, %ymm11, %ymm11 + 
vpshufd $0x93, %ymm15, %ymm15 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm2, %ymm4 + vpaddd %ymm9, %ymm10, %ymm6 + vpaddd %ymm13, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpshufd $0x4e, %ymm14, %ymm14 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + + vpaddd %ymm0, %ymm3, %ymm4 + vpaddd %ymm8, %ymm11, %ymm6 + vpaddd %ymm12, %ymm15, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpshufd $0x39, %ymm1, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpshufd $0x39, %ymm9, %ymm9 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpshufd $0x39, %ymm13, %ymm13 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm0, %ymm4 + vpaddd %ymm9, %ymm8, %ymm6 + vpaddd %ymm13, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm1, %ymm4 + vpaddd %ymm10, %ymm9, %ymm6 + vpaddd %ymm14, %ymm13, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm1, %ymm1 + vpshufd $0x93, %ymm9, %ymm9 + vpshufd $0x93, %ymm13, %ymm13 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm2, %ymm4 + vpaddd %ymm11, %ymm10, %ymm6 + vpaddd %ymm15, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpshufd $0x4e, %ymm14, %ymm14 + vpshufd $0x39, %ymm11, %ymm11 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpshufd $0x39, %ymm3, %ymm3 + vpshufd $0x39, %ymm15, %ymm15 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + vpaddd %ymm0, %ymm1, %ymm4 + vpaddd %ymm8, %ymm9, %ymm6 + vpaddd %ymm12, %ymm13, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm0, %ymm4 + vpaddd %ymm11, %ymm8, %ymm6 + vpaddd %ymm15, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, 
%ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm3, %ymm4 + vpaddd %ymm10, %ymm11, %ymm6 + vpaddd %ymm14, %ymm15, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm3, %ymm3 + vpshufd $0x93, %ymm11, %ymm11 + vpshufd $0x93, %ymm15, %ymm15 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm2, %ymm4 + vpaddd %ymm9, %ymm10, %ymm6 + vpaddd %ymm13, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpshufd $0x4e, %ymm14, %ymm14 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + + vpaddd %ymm0, %ymm3, %ymm4 + vpaddd %ymm8, %ymm11, %ymm6 + vpaddd %ymm12, %ymm15, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpshufd $0x39, %ymm1, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpshufd $0x39, %ymm9, %ymm9 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpshufd $0x39, %ymm13, %ymm13 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm0, %ymm4 + vpaddd %ymm9, %ymm8, %ymm6 + vpaddd %ymm13, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm1, %ymm4 + vpaddd %ymm10, %ymm9, %ymm6 + vpaddd %ymm14, %ymm13, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm1, %ymm1 + vpshufd $0x93, %ymm9, %ymm9 + vpshufd $0x93, %ymm13, %ymm13 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm2, %ymm4 + vpaddd %ymm11, %ymm10, %ymm6 + vpaddd %ymm15, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpshufd $0x4e, %ymm14, %ymm14 + vpshufd $0x39, %ymm11, %ymm11 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpshufd $0x39, %ymm3, %ymm3 + vpshufd $0x39, %ymm15, %ymm15 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + vpaddd %ymm0, %ymm1, %ymm4 + vpaddd %ymm8, %ymm9, %ymm6 + vpaddd %ymm12, %ymm13, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $7, %ymm6, %ymm5 + 
vpsrld $25, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm0, %ymm4 + vpaddd %ymm11, %ymm8, %ymm6 + vpaddd %ymm15, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm3, %ymm4 + vpaddd %ymm10, %ymm11, %ymm6 + vpaddd %ymm14, %ymm15, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm3, %ymm3 + vpshufd $0x93, %ymm11, %ymm11 + vpshufd $0x93, %ymm15, %ymm15 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm2, %ymm4 + vpaddd %ymm9, %ymm10, %ymm6 + vpaddd %ymm13, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpshufd $0x4e, %ymm14, %ymm14 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + + vpaddd %ymm0, %ymm3, %ymm4 + vpaddd %ymm8, %ymm11, %ymm6 + vpaddd %ymm12, %ymm15, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpshufd $0x39, %ymm1, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpshufd $0x39, %ymm9, %ymm9 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpshufd $0x39, %ymm13, %ymm13 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm0, %ymm4 + vpaddd %ymm9, %ymm8, %ymm6 + vpaddd %ymm13, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm1, %ymm4 + vpaddd %ymm10, %ymm9, %ymm6 + vpaddd %ymm14, %ymm13, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm1, %ymm1 + vpshufd $0x93, %ymm9, %ymm9 + vpshufd $0x93, %ymm13, %ymm13 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm2, %ymm4 + vpaddd %ymm11, %ymm10, %ymm6 + vpaddd %ymm15, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpshufd $0x4e, %ymm14, %ymm14 + vpshufd $0x39, %ymm11, %ymm11 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, 
%ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpshufd $0x39, %ymm3, %ymm3 + vpshufd $0x39, %ymm15, %ymm15 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + vpaddd %ymm0, %ymm1, %ymm4 + vpaddd %ymm8, %ymm9, %ymm6 + vpaddd %ymm12, %ymm13, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm0, %ymm4 + vpaddd %ymm11, %ymm8, %ymm6 + vpaddd %ymm15, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm3, %ymm4 + vpaddd %ymm10, %ymm11, %ymm6 + vpaddd %ymm14, %ymm15, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm3, %ymm3 + vpshufd $0x93, %ymm11, %ymm11 + vpshufd $0x93, %ymm15, %ymm15 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm2, %ymm4 + vpaddd %ymm9, %ymm10, %ymm6 + vpaddd %ymm13, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpshufd $0x4e, %ymm14, %ymm14 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + + vpaddd %ymm0, %ymm3, %ymm4 + vpaddd %ymm8, %ymm11, %ymm6 + vpaddd %ymm12, %ymm15, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpshufd $0x39, %ymm1, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpshufd $0x39, %ymm9, %ymm9 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpshufd $0x39, %ymm13, %ymm13 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm0, %ymm4 + vpaddd %ymm9, %ymm8, %ymm6 + vpaddd %ymm13, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm1, %ymm4 + vpaddd %ymm10, %ymm9, %ymm6 + vpaddd %ymm14, %ymm13, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm1, %ymm1 + vpshufd $0x93, %ymm9, %ymm9 + vpshufd $0x93, %ymm13, %ymm13 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd 
%ymm3, %ymm2, %ymm4 + vpaddd %ymm11, %ymm10, %ymm6 + vpaddd %ymm15, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpshufd $0x4e, %ymm14, %ymm14 + vpshufd $0x39, %ymm11, %ymm11 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpshufd $0x39, %ymm3, %ymm3 + vpshufd $0x39, %ymm15, %ymm15 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + vpaddd 0*256+4*32(%rsp), %ymm0, %ymm0 + vpaddd 0*256+5*32(%rsp), %ymm1, %ymm1 + vpaddd 0*256+6*32(%rsp), %ymm2, %ymm2 + vpaddd 0*256+7*32(%rsp), %ymm3, %ymm3 + vpaddd 1*256+4*32(%rsp), %ymm8, %ymm8 + vpaddd 1*256+5*32(%rsp), %ymm9, %ymm9 + vpaddd 1*256+6*32(%rsp), %ymm10, %ymm10 + vpaddd 1*256+7*32(%rsp), %ymm11, %ymm11 + vpaddd 2*256+4*32(%rsp), %ymm12, %ymm12 + vpaddd 2*256+5*32(%rsp), %ymm13, %ymm13 + vpaddd 2*256+6*32(%rsp), %ymm14, %ymm14 + vpaddd 2*256+7*32(%rsp), %ymm15, %ymm15 + + addq $6*128, %rbx + cmpq %rax, %rbx + jne scrypt_core_6way_avx2_loop1 + + vmovdqa %ymm0, 0*256+4*32(%rsp) + vmovdqa %ymm1, 0*256+5*32(%rsp) + vmovdqa %ymm2, 0*256+6*32(%rsp) + vmovdqa %ymm3, 0*256+7*32(%rsp) + vmovdqa %ymm8, 1*256+4*32(%rsp) + vmovdqa %ymm9, 1*256+5*32(%rsp) + vmovdqa %ymm10, 1*256+6*32(%rsp) + vmovdqa %ymm11, 1*256+7*32(%rsp) + vmovdqa %ymm12, 2*256+4*32(%rsp) + vmovdqa %ymm13, 2*256+5*32(%rsp) + vmovdqa %ymm14, 2*256+6*32(%rsp) + vmovdqa %ymm15, 2*256+7*32(%rsp) + + movq %r8, %rcx + leaq -1(%r8), %r11 +scrypt_core_6way_avx2_loop2: + vmovd %xmm0, %ebp + vmovd %xmm8, %ebx + vmovd %xmm12, %eax + vextracti128 $1, %ymm0, %xmm4 + vextracti128 $1, %ymm8, %xmm5 + vextracti128 $1, %ymm12, %xmm6 + vmovd %xmm4, %r8d + vmovd %xmm5, %r9d + vmovd %xmm6, %r10d + vpxor 0*256+0*32(%rsp), %ymm0, %ymm0 + vpxor 0*256+1*32(%rsp), %ymm1, %ymm1 + vpxor 0*256+2*32(%rsp), %ymm2, %ymm2 + vpxor 0*256+3*32(%rsp), %ymm3, %ymm3 + vpxor 1*256+0*32(%rsp), %ymm8, %ymm8 + vpxor 1*256+1*32(%rsp), %ymm9, %ymm9 + vpxor 1*256+2*32(%rsp), %ymm10, %ymm10 + vpxor 1*256+3*32(%rsp), %ymm11, %ymm11 + vpxor 2*256+0*32(%rsp), %ymm12, %ymm12 + vpxor 2*256+1*32(%rsp), %ymm13, %ymm13 + vpxor 2*256+2*32(%rsp), %ymm14, %ymm14 + vpxor 2*256+3*32(%rsp), %ymm15, %ymm15 + andl %r11d, %ebp + leaq 0(%rbp, %rbp, 2), %rbp + shll $8, %ebp + andl %r11d, %ebx + leaq 1(%rbx, %rbx, 2), %rbx + shll $8, %ebx + andl %r11d, %eax leaq 2(%rax, %rax, 2), %rax - shll $7, %eax - pxor 0(%rsi, %rbp), %xmm0 - pxor 16(%rsi, %rbp), %xmm1 - pxor 32(%rsi, %rbp), %xmm2 - pxor 48(%rsi, %rbp), %xmm3 - pxor 0(%rsi, %rbx), %xmm8 - pxor 16(%rsi, %rbx), %xmm9 - pxor 32(%rsi, %rbx), %xmm10 - pxor 48(%rsi, %rbx), %xmm11 - pxor 0(%rsi, %rax), %xmm12 - pxor 16(%rsi, %rax), %xmm13 - pxor 32(%rsi, %rax), %xmm14 - pxor 48(%rsi, %rax), %xmm15 + shll $8, %eax + andl %r11d, %r8d + leaq 0(%r8, %r8, 2), %r8 + shll $8, %r8d + andl %r11d, %r9d + leaq 1(%r9, %r9, 2), %r9 + shll $8, %r9d + andl %r11d, %r10d + leaq 2(%r10, %r10, 2), %r10 + shll $8, %r10d + vmovdqa 0*32(%rsi, %rbp), %xmm4 + vinserti128 $1, 0*32+16(%rsi, %r8), %ymm4, %ymm4 + vmovdqa 1*32(%rsi, %rbp), %xmm5 + vinserti128 $1, 1*32+16(%rsi, %r8), %ymm5, %ymm5 + vmovdqa 2*32(%rsi, %rbp), %xmm6 + vinserti128 $1, 2*32+16(%rsi, %r8), %ymm6, %ymm6 + vmovdqa 3*32(%rsi, %rbp), %xmm7 + vinserti128 $1, 3*32+16(%rsi, %r8), %ymm7, %ymm7 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm6, %ymm2, %ymm2 + vpxor %ymm7, %ymm3, 
%ymm3 + vmovdqa 0*32(%rsi, %rbx), %xmm4 + vinserti128 $1, 0*32+16(%rsi, %r9), %ymm4, %ymm4 + vmovdqa 1*32(%rsi, %rbx), %xmm5 + vinserti128 $1, 1*32+16(%rsi, %r9), %ymm5, %ymm5 + vmovdqa 2*32(%rsi, %rbx), %xmm6 + vinserti128 $1, 2*32+16(%rsi, %r9), %ymm6, %ymm6 + vmovdqa 3*32(%rsi, %rbx), %xmm7 + vinserti128 $1, 3*32+16(%rsi, %r9), %ymm7, %ymm7 + vpxor %ymm4, %ymm8, %ymm8 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm10, %ymm10 + vpxor %ymm7, %ymm11, %ymm11 + vmovdqa 0*32(%rsi, %rax), %xmm4 + vinserti128 $1, 0*32+16(%rsi, %r10), %ymm4, %ymm4 + vmovdqa 1*32(%rsi, %rax), %xmm5 + vinserti128 $1, 1*32+16(%rsi, %r10), %ymm5, %ymm5 + vmovdqa 2*32(%rsi, %rax), %xmm6 + vinserti128 $1, 2*32+16(%rsi, %r10), %ymm6, %ymm6 + vmovdqa 3*32(%rsi, %rax), %xmm7 + vinserti128 $1, 3*32+16(%rsi, %r10), %ymm7, %ymm7 + vpxor %ymm4, %ymm12, %ymm12 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm6, %ymm14, %ymm14 + vpxor %ymm7, %ymm15, %ymm15 + + vmovdqa %ymm0, 0*256+0*32(%rsp) + vmovdqa %ymm1, 0*256+1*32(%rsp) + vmovdqa %ymm2, 0*256+2*32(%rsp) + vmovdqa %ymm3, 0*256+3*32(%rsp) + vmovdqa %ymm8, 1*256+0*32(%rsp) + vmovdqa %ymm9, 1*256+1*32(%rsp) + vmovdqa %ymm10, 1*256+2*32(%rsp) + vmovdqa %ymm11, 1*256+3*32(%rsp) + vmovdqa %ymm12, 2*256+0*32(%rsp) + vmovdqa %ymm13, 2*256+1*32(%rsp) + vmovdqa %ymm14, 2*256+2*32(%rsp) + vmovdqa %ymm15, 2*256+3*32(%rsp) + vpaddd %ymm0, %ymm1, %ymm4 + vpaddd %ymm8, %ymm9, %ymm6 + vpaddd %ymm12, %ymm13, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm0, %ymm4 + vpaddd %ymm11, %ymm8, %ymm6 + vpaddd %ymm15, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm3, %ymm4 + vpaddd %ymm10, %ymm11, %ymm6 + vpaddd %ymm14, %ymm15, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm3, %ymm3 + vpshufd $0x93, %ymm11, %ymm11 + vpshufd $0x93, %ymm15, %ymm15 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm2, %ymm4 + vpaddd %ymm9, %ymm10, %ymm6 + vpaddd %ymm13, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpshufd $0x4e, %ymm14, %ymm14 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + + vpaddd %ymm0, %ymm3, %ymm4 + vpaddd %ymm8, %ymm11, %ymm6 + vpaddd %ymm12, %ymm15, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpshufd $0x39, %ymm1, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpshufd $0x39, %ymm9, %ymm9 + 
vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpshufd $0x39, %ymm13, %ymm13 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm0, %ymm4 + vpaddd %ymm9, %ymm8, %ymm6 + vpaddd %ymm13, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm1, %ymm4 + vpaddd %ymm10, %ymm9, %ymm6 + vpaddd %ymm14, %ymm13, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm1, %ymm1 + vpshufd $0x93, %ymm9, %ymm9 + vpshufd $0x93, %ymm13, %ymm13 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm2, %ymm4 + vpaddd %ymm11, %ymm10, %ymm6 + vpaddd %ymm15, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpshufd $0x4e, %ymm14, %ymm14 + vpshufd $0x39, %ymm11, %ymm11 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpshufd $0x39, %ymm3, %ymm3 + vpshufd $0x39, %ymm15, %ymm15 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + vpaddd %ymm0, %ymm1, %ymm4 + vpaddd %ymm8, %ymm9, %ymm6 + vpaddd %ymm12, %ymm13, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm0, %ymm4 + vpaddd %ymm11, %ymm8, %ymm6 + vpaddd %ymm15, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm3, %ymm4 + vpaddd %ymm10, %ymm11, %ymm6 + vpaddd %ymm14, %ymm15, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm3, %ymm3 + vpshufd $0x93, %ymm11, %ymm11 + vpshufd $0x93, %ymm15, %ymm15 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm2, %ymm4 + vpaddd %ymm9, %ymm10, %ymm6 + vpaddd %ymm13, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpshufd $0x4e, %ymm14, %ymm14 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, 
%ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + + vpaddd %ymm0, %ymm3, %ymm4 + vpaddd %ymm8, %ymm11, %ymm6 + vpaddd %ymm12, %ymm15, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpshufd $0x39, %ymm1, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpshufd $0x39, %ymm9, %ymm9 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpshufd $0x39, %ymm13, %ymm13 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm0, %ymm4 + vpaddd %ymm9, %ymm8, %ymm6 + vpaddd %ymm13, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm1, %ymm4 + vpaddd %ymm10, %ymm9, %ymm6 + vpaddd %ymm14, %ymm13, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm1, %ymm1 + vpshufd $0x93, %ymm9, %ymm9 + vpshufd $0x93, %ymm13, %ymm13 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm2, %ymm4 + vpaddd %ymm11, %ymm10, %ymm6 + vpaddd %ymm15, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpshufd $0x4e, %ymm14, %ymm14 + vpshufd $0x39, %ymm11, %ymm11 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpshufd $0x39, %ymm3, %ymm3 + vpshufd $0x39, %ymm15, %ymm15 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + vpaddd %ymm0, %ymm1, %ymm4 + vpaddd %ymm8, %ymm9, %ymm6 + vpaddd %ymm12, %ymm13, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm0, %ymm4 + vpaddd %ymm11, %ymm8, %ymm6 + vpaddd %ymm15, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm3, %ymm4 + vpaddd %ymm10, %ymm11, %ymm6 + vpaddd %ymm14, %ymm15, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm3, %ymm3 + vpshufd $0x93, %ymm11, %ymm11 + vpshufd $0x93, %ymm15, %ymm15 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + 
vpaddd %ymm1, %ymm2, %ymm4 + vpaddd %ymm9, %ymm10, %ymm6 + vpaddd %ymm13, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpshufd $0x4e, %ymm14, %ymm14 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + + vpaddd %ymm0, %ymm3, %ymm4 + vpaddd %ymm8, %ymm11, %ymm6 + vpaddd %ymm12, %ymm15, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpshufd $0x39, %ymm1, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpshufd $0x39, %ymm9, %ymm9 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpshufd $0x39, %ymm13, %ymm13 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm0, %ymm4 + vpaddd %ymm9, %ymm8, %ymm6 + vpaddd %ymm13, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm1, %ymm4 + vpaddd %ymm10, %ymm9, %ymm6 + vpaddd %ymm14, %ymm13, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm1, %ymm1 + vpshufd $0x93, %ymm9, %ymm9 + vpshufd $0x93, %ymm13, %ymm13 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm2, %ymm4 + vpaddd %ymm11, %ymm10, %ymm6 + vpaddd %ymm15, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpshufd $0x4e, %ymm14, %ymm14 + vpshufd $0x39, %ymm11, %ymm11 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpshufd $0x39, %ymm3, %ymm3 + vpshufd $0x39, %ymm15, %ymm15 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + vpaddd %ymm0, %ymm1, %ymm4 + vpaddd %ymm8, %ymm9, %ymm6 + vpaddd %ymm12, %ymm13, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm0, %ymm4 + vpaddd %ymm11, %ymm8, %ymm6 + vpaddd %ymm15, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm3, %ymm4 + vpaddd %ymm10, %ymm11, %ymm6 + vpaddd %ymm14, %ymm15, %ymm7 + vpslld $13, 
%ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm3, %ymm3 + vpshufd $0x93, %ymm11, %ymm11 + vpshufd $0x93, %ymm15, %ymm15 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm2, %ymm4 + vpaddd %ymm9, %ymm10, %ymm6 + vpaddd %ymm13, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpshufd $0x4e, %ymm14, %ymm14 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + + vpaddd %ymm0, %ymm3, %ymm4 + vpaddd %ymm8, %ymm11, %ymm6 + vpaddd %ymm12, %ymm15, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpshufd $0x39, %ymm1, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpshufd $0x39, %ymm9, %ymm9 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpshufd $0x39, %ymm13, %ymm13 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm0, %ymm4 + vpaddd %ymm9, %ymm8, %ymm6 + vpaddd %ymm13, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm1, %ymm4 + vpaddd %ymm10, %ymm9, %ymm6 + vpaddd %ymm14, %ymm13, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm1, %ymm1 + vpshufd $0x93, %ymm9, %ymm9 + vpshufd $0x93, %ymm13, %ymm13 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm2, %ymm4 + vpaddd %ymm11, %ymm10, %ymm6 + vpaddd %ymm15, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpshufd $0x4e, %ymm14, %ymm14 + vpshufd $0x39, %ymm11, %ymm11 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpshufd $0x39, %ymm3, %ymm3 + vpshufd $0x39, %ymm15, %ymm15 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + vpaddd 0*256+0*32(%rsp), %ymm0, %ymm0 + vpaddd 0*256+1*32(%rsp), %ymm1, %ymm1 + vpaddd 0*256+2*32(%rsp), %ymm2, %ymm2 + vpaddd 0*256+3*32(%rsp), %ymm3, %ymm3 + vpaddd 1*256+0*32(%rsp), %ymm8, %ymm8 + vpaddd 1*256+1*32(%rsp), %ymm9, %ymm9 + vpaddd 1*256+2*32(%rsp), %ymm10, %ymm10 + vpaddd 1*256+3*32(%rsp), %ymm11, %ymm11 + vpaddd 2*256+0*32(%rsp), %ymm12, %ymm12 + vpaddd 2*256+1*32(%rsp), %ymm13, %ymm13 + vpaddd 2*256+2*32(%rsp), %ymm14, %ymm14 + vpaddd 2*256+3*32(%rsp), %ymm15, %ymm15 + vmovdqa %ymm0, 0*256+0*32(%rsp) + vmovdqa %ymm1, 
0*256+1*32(%rsp) + vmovdqa %ymm2, 0*256+2*32(%rsp) + vmovdqa %ymm3, 0*256+3*32(%rsp) + vmovdqa %ymm8, 1*256+0*32(%rsp) + vmovdqa %ymm9, 1*256+1*32(%rsp) + vmovdqa %ymm10, 1*256+2*32(%rsp) + vmovdqa %ymm11, 1*256+3*32(%rsp) + vmovdqa %ymm12, 2*256+0*32(%rsp) + vmovdqa %ymm13, 2*256+1*32(%rsp) + vmovdqa %ymm14, 2*256+2*32(%rsp) + vmovdqa %ymm15, 2*256+3*32(%rsp) - movdqa %xmm0, 0(%rsp) - movdqa %xmm1, 16(%rsp) - movdqa %xmm2, 32(%rsp) - movdqa %xmm3, 48(%rsp) - movdqa %xmm8, 128+0(%rsp) - movdqa %xmm9, 128+16(%rsp) - movdqa %xmm10, 128+32(%rsp) - movdqa %xmm11, 128+48(%rsp) - movdqa %xmm12, 256+0(%rsp) - movdqa %xmm13, 256+16(%rsp) - movdqa %xmm14, 256+32(%rsp) - movdqa %xmm15, 256+48(%rsp) - salsa8_core_3way_xmm - paddd 0(%rsp), %xmm0 - paddd 16(%rsp), %xmm1 - paddd 32(%rsp), %xmm2 - paddd 48(%rsp), %xmm3 - paddd 128+0(%rsp), %xmm8 - paddd 128+16(%rsp), %xmm9 - paddd 128+32(%rsp), %xmm10 - paddd 128+48(%rsp), %xmm11 - paddd 256+0(%rsp), %xmm12 - paddd 256+16(%rsp), %xmm13 - paddd 256+32(%rsp), %xmm14 - paddd 256+48(%rsp), %xmm15 - movdqa %xmm0, 0(%rsp) - movdqa %xmm1, 16(%rsp) - movdqa %xmm2, 32(%rsp) - movdqa %xmm3, 48(%rsp) - movdqa %xmm8, 128+0(%rsp) - movdqa %xmm9, 128+16(%rsp) - movdqa %xmm10, 128+32(%rsp) - movdqa %xmm11, 128+48(%rsp) - movdqa %xmm12, 256+0(%rsp) - movdqa %xmm13, 256+16(%rsp) - movdqa %xmm14, 256+32(%rsp) - movdqa %xmm15, 256+48(%rsp) + vmovdqa 4*32(%rsi, %rbp), %xmm4 + vinserti128 $1, 4*32+16(%rsi, %r8), %ymm4, %ymm4 + vmovdqa 5*32(%rsi, %rbp), %xmm5 + vinserti128 $1, 5*32+16(%rsi, %r8), %ymm5, %ymm5 + vmovdqa 6*32(%rsi, %rbp), %xmm6 + vinserti128 $1, 6*32+16(%rsi, %r8), %ymm6, %ymm6 + vmovdqa 7*32(%rsi, %rbp), %xmm7 + vinserti128 $1, 7*32+16(%rsi, %r8), %ymm7, %ymm7 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm6, %ymm2, %ymm2 + vpxor %ymm7, %ymm3, %ymm3 + vmovdqa 4*32(%rsi, %rbx), %xmm4 + vinserti128 $1, 4*32+16(%rsi, %r9), %ymm4, %ymm4 + vmovdqa 5*32(%rsi, %rbx), %xmm5 + vinserti128 $1, 5*32+16(%rsi, %r9), %ymm5, %ymm5 + vmovdqa 6*32(%rsi, %rbx), %xmm6 + vinserti128 $1, 6*32+16(%rsi, %r9), %ymm6, %ymm6 + vmovdqa 7*32(%rsi, %rbx), %xmm7 + vinserti128 $1, 7*32+16(%rsi, %r9), %ymm7, %ymm7 + vpxor %ymm4, %ymm8, %ymm8 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm10, %ymm10 + vpxor %ymm7, %ymm11, %ymm11 + vmovdqa 4*32(%rsi, %rax), %xmm4 + vinserti128 $1, 4*32+16(%rsi, %r10), %ymm4, %ymm4 + vmovdqa 5*32(%rsi, %rax), %xmm5 + vinserti128 $1, 5*32+16(%rsi, %r10), %ymm5, %ymm5 + vmovdqa 6*32(%rsi, %rax), %xmm6 + vinserti128 $1, 6*32+16(%rsi, %r10), %ymm6, %ymm6 + vmovdqa 7*32(%rsi, %rax), %xmm7 + vinserti128 $1, 7*32+16(%rsi, %r10), %ymm7, %ymm7 + vpxor %ymm4, %ymm12, %ymm12 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm6, %ymm14, %ymm14 + vpxor %ymm7, %ymm15, %ymm15 + vpxor 0*256+4*32(%rsp), %ymm0, %ymm0 + vpxor 0*256+5*32(%rsp), %ymm1, %ymm1 + vpxor 0*256+6*32(%rsp), %ymm2, %ymm2 + vpxor 0*256+7*32(%rsp), %ymm3, %ymm3 + vpxor 1*256+4*32(%rsp), %ymm8, %ymm8 + vpxor 1*256+5*32(%rsp), %ymm9, %ymm9 + vpxor 1*256+6*32(%rsp), %ymm10, %ymm10 + vpxor 1*256+7*32(%rsp), %ymm11, %ymm11 + vpxor 2*256+4*32(%rsp), %ymm12, %ymm12 + vpxor 2*256+5*32(%rsp), %ymm13, %ymm13 + vpxor 2*256+6*32(%rsp), %ymm14, %ymm14 + vpxor 2*256+7*32(%rsp), %ymm15, %ymm15 + vmovdqa %ymm0, 0*256+4*32(%rsp) + vmovdqa %ymm1, 0*256+5*32(%rsp) + vmovdqa %ymm2, 0*256+6*32(%rsp) + vmovdqa %ymm3, 0*256+7*32(%rsp) + vmovdqa %ymm8, 1*256+4*32(%rsp) + vmovdqa %ymm9, 1*256+5*32(%rsp) + vmovdqa %ymm10, 1*256+6*32(%rsp) + vmovdqa %ymm11, 1*256+7*32(%rsp) + vmovdqa %ymm12, 2*256+4*32(%rsp) + vmovdqa 
%ymm13, 2*256+5*32(%rsp) + vmovdqa %ymm14, 2*256+6*32(%rsp) + vmovdqa %ymm15, 2*256+7*32(%rsp) + vpaddd %ymm0, %ymm1, %ymm4 + vpaddd %ymm8, %ymm9, %ymm6 + vpaddd %ymm12, %ymm13, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 - pxor 64(%rsi, %rbp), %xmm0 - pxor 80(%rsi, %rbp), %xmm1 - pxor 96(%rsi, %rbp), %xmm2 - pxor 112(%rsi, %rbp), %xmm3 - pxor 64(%rsi, %rbx), %xmm8 - pxor 80(%rsi, %rbx), %xmm9 - pxor 96(%rsi, %rbx), %xmm10 - pxor 112(%rsi, %rbx), %xmm11 - pxor 64(%rsi, %rax), %xmm12 - pxor 80(%rsi, %rax), %xmm13 - pxor 96(%rsi, %rax), %xmm14 - pxor 112(%rsi, %rax), %xmm15 - pxor 64(%rsp), %xmm0 - pxor 80(%rsp), %xmm1 - pxor 96(%rsp), %xmm2 - pxor 112(%rsp), %xmm3 - pxor 128+64(%rsp), %xmm8 - pxor 128+80(%rsp), %xmm9 - pxor 128+96(%rsp), %xmm10 - pxor 128+112(%rsp), %xmm11 - pxor 256+64(%rsp), %xmm12 - pxor 256+80(%rsp), %xmm13 - pxor 256+96(%rsp), %xmm14 - pxor 256+112(%rsp), %xmm15 - movdqa %xmm0, 64(%rsp) - movdqa %xmm1, 80(%rsp) - movdqa %xmm2, 96(%rsp) - movdqa %xmm3, 112(%rsp) - movdqa %xmm8, 128+64(%rsp) - movdqa %xmm9, 128+80(%rsp) - movdqa %xmm10, 128+96(%rsp) - movdqa %xmm11, 128+112(%rsp) - movdqa %xmm12, 256+64(%rsp) - movdqa %xmm13, 256+80(%rsp) - movdqa %xmm14, 256+96(%rsp) - movdqa %xmm15, 256+112(%rsp) - salsa8_core_3way_xmm - paddd 64(%rsp), %xmm0 - paddd 80(%rsp), %xmm1 - paddd 96(%rsp), %xmm2 - paddd 112(%rsp), %xmm3 - paddd 128+64(%rsp), %xmm8 - paddd 128+80(%rsp), %xmm9 - paddd 128+96(%rsp), %xmm10 - paddd 128+112(%rsp), %xmm11 - paddd 256+64(%rsp), %xmm12 - paddd 256+80(%rsp), %xmm13 - paddd 256+96(%rsp), %xmm14 - paddd 256+112(%rsp), %xmm15 - movdqa %xmm0, 64(%rsp) - movdqa %xmm1, 80(%rsp) - movdqa %xmm2, 96(%rsp) - movdqa %xmm3, 112(%rsp) - movdqa %xmm8, 128+64(%rsp) - movdqa %xmm9, 128+80(%rsp) - movdqa %xmm10, 128+96(%rsp) - movdqa %xmm11, 128+112(%rsp) - movdqa %xmm12, 256+64(%rsp) - movdqa %xmm13, 256+80(%rsp) - movdqa %xmm14, 256+96(%rsp) - movdqa %xmm15, 256+112(%rsp) + vpaddd %ymm3, %ymm0, %ymm4 + vpaddd %ymm11, %ymm8, %ymm6 + vpaddd %ymm15, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 - subq $1, %rcx - ja scrypt_core_3way_xmm_loop2 + vpaddd %ymm2, %ymm3, %ymm4 + vpaddd %ymm10, %ymm11, %ymm6 + vpaddd %ymm14, %ymm15, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm3, %ymm3 + vpshufd $0x93, %ymm11, %ymm11 + vpshufd $0x93, %ymm15, %ymm15 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm2, %ymm4 + vpaddd %ymm9, %ymm10, %ymm6 + vpaddd %ymm13, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpshufd $0x4e, %ymm14, %ymm14 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld 
$14, %ymm6, %ymm6 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + + vpaddd %ymm0, %ymm3, %ymm4 + vpaddd %ymm8, %ymm11, %ymm6 + vpaddd %ymm12, %ymm15, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpshufd $0x39, %ymm1, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpshufd $0x39, %ymm9, %ymm9 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpshufd $0x39, %ymm13, %ymm13 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm0, %ymm4 + vpaddd %ymm9, %ymm8, %ymm6 + vpaddd %ymm13, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 - scrypt_shuffle %rsp, 0, %rdi, 0 - scrypt_shuffle %rsp, 64, %rdi, 64 - scrypt_shuffle %rsp, 128, %rdi, 128 - scrypt_shuffle %rsp, 192, %rdi, 192 - scrypt_shuffle %rsp, 256, %rdi, 256 - scrypt_shuffle %rsp, 320, %rdi, 320 + vpaddd %ymm2, %ymm1, %ymm4 + vpaddd %ymm10, %ymm9, %ymm6 + vpaddd %ymm14, %ymm13, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm1, %ymm1 + vpshufd $0x93, %ymm9, %ymm9 + vpshufd $0x93, %ymm13, %ymm13 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 - scrypt_core_3way_cleanup - ret - - -#if defined(USE_AVX2) - -.macro salsa8_core_6way_avx2_doubleround + vpaddd %ymm3, %ymm2, %ymm4 + vpaddd %ymm11, %ymm10, %ymm6 + vpaddd %ymm15, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpshufd $0x4e, %ymm14, %ymm14 + vpshufd $0x39, %ymm11, %ymm11 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpshufd $0x39, %ymm3, %ymm3 + vpshufd $0x39, %ymm15, %ymm15 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 vpaddd %ymm0, %ymm1, %ymm4 vpaddd %ymm8, %ymm9, %ymm6 vpaddd %ymm12, %ymm13, %ymm7 @@ -2435,435 +13789,296 @@ scrypt_core_3way_xmm_loop2: vpshufd $0x39, %ymm15, %ymm15 vpxor %ymm5, %ymm12, %ymm12 vpxor %ymm7, %ymm12, %ymm12 -.endm - -.macro salsa8_core_6way_avx2 - salsa8_core_6way_avx2_doubleround - salsa8_core_6way_avx2_doubleround - salsa8_core_6way_avx2_doubleround - salsa8_core_6way_avx2_doubleround -.endm - - .text - .p2align 6 - .globl scrypt_core_6way - .globl _scrypt_core_6way -scrypt_core_6way: -_scrypt_core_6way: - pushq %rbx - pushq %rbp -#if defined(_WIN64) || defined(__CYGWIN__) - subq $176, %rsp - vmovdqa %xmm6, 8(%rsp) - vmovdqa %xmm7, 24(%rsp) - vmovdqa %xmm8, 40(%rsp) - vmovdqa %xmm9, 56(%rsp) - vmovdqa %xmm10, 72(%rsp) - vmovdqa %xmm11, 88(%rsp) - vmovdqa %xmm12, 104(%rsp) - vmovdqa %xmm13, 120(%rsp) - vmovdqa %xmm14, 136(%rsp) - vmovdqa %xmm15, 152(%rsp) - pushq %rdi - pushq %rsi - movq %rcx, %rdi - movq %rdx, %rsi -#else - movq 
%rdx, %r8 -#endif - movq %rsp, %rdx - subq $768, %rsp - andq $-128, %rsp - -.macro scrypt_core_6way_cleanup - movq %rdx, %rsp -#if defined(_WIN64) || defined(__CYGWIN__) - popq %rsi - popq %rdi - vmovdqa 8(%rsp), %xmm6 - vmovdqa 24(%rsp), %xmm7 - vmovdqa 40(%rsp), %xmm8 - vmovdqa 56(%rsp), %xmm9 - vmovdqa 72(%rsp), %xmm10 - vmovdqa 88(%rsp), %xmm11 - vmovdqa 104(%rsp), %xmm12 - vmovdqa 120(%rsp), %xmm13 - vmovdqa 136(%rsp), %xmm14 - vmovdqa 152(%rsp), %xmm15 - addq $176, %rsp -#endif - popq %rbp - popq %rbx -.endm - -.macro scrypt_shuffle_pack2 src, so, dest, do - vmovdqa \so+0*16(\src), %xmm0 - vmovdqa \so+1*16(\src), %xmm1 - vmovdqa \so+2*16(\src), %xmm2 - vmovdqa \so+3*16(\src), %xmm3 - vinserti128 $1, \so+128+0*16(\src), %ymm0, %ymm0 - vinserti128 $1, \so+128+1*16(\src), %ymm1, %ymm1 - vinserti128 $1, \so+128+2*16(\src), %ymm2, %ymm2 - vinserti128 $1, \so+128+3*16(\src), %ymm3, %ymm3 - vpblendd $0x33, %ymm0, %ymm2, %ymm4 - vpblendd $0xcc, %ymm1, %ymm3, %ymm5 - vpblendd $0x33, %ymm2, %ymm0, %ymm6 - vpblendd $0xcc, %ymm3, %ymm1, %ymm7 - vpblendd $0x55, %ymm7, %ymm6, %ymm3 - vpblendd $0x55, %ymm6, %ymm5, %ymm2 - vpblendd $0x55, %ymm5, %ymm4, %ymm1 - vpblendd $0x55, %ymm4, %ymm7, %ymm0 - vmovdqa %ymm0, \do+0*32(\dest) - vmovdqa %ymm1, \do+1*32(\dest) - vmovdqa %ymm2, \do+2*32(\dest) - vmovdqa %ymm3, \do+3*32(\dest) -.endm - -.macro scrypt_shuffle_unpack2 src, so, dest, do - vmovdqa \so+0*32(\src), %ymm0 - vmovdqa \so+1*32(\src), %ymm1 - vmovdqa \so+2*32(\src), %ymm2 - vmovdqa \so+3*32(\src), %ymm3 - vpblendd $0x33, %ymm0, %ymm2, %ymm4 - vpblendd $0xcc, %ymm1, %ymm3, %ymm5 - vpblendd $0x33, %ymm2, %ymm0, %ymm6 - vpblendd $0xcc, %ymm3, %ymm1, %ymm7 - vpblendd $0x55, %ymm7, %ymm6, %ymm3 - vpblendd $0x55, %ymm6, %ymm5, %ymm2 - vpblendd $0x55, %ymm5, %ymm4, %ymm1 - vpblendd $0x55, %ymm4, %ymm7, %ymm0 - vmovdqa %xmm0, \do+0*16(\dest) - vmovdqa %xmm1, \do+1*16(\dest) - vmovdqa %xmm2, \do+2*16(\dest) - vmovdqa %xmm3, \do+3*16(\dest) - vextracti128 $1, %ymm0, \do+128+0*16(\dest) - vextracti128 $1, %ymm1, \do+128+1*16(\dest) - vextracti128 $1, %ymm2, \do+128+2*16(\dest) - vextracti128 $1, %ymm3, \do+128+3*16(\dest) -.endm - -scrypt_core_6way_avx2: - scrypt_shuffle_pack2 %rdi, 0*256+0, %rsp, 0*128 - scrypt_shuffle_pack2 %rdi, 0*256+64, %rsp, 1*128 - scrypt_shuffle_pack2 %rdi, 1*256+0, %rsp, 2*128 - scrypt_shuffle_pack2 %rdi, 1*256+64, %rsp, 3*128 - scrypt_shuffle_pack2 %rdi, 2*256+0, %rsp, 4*128 - scrypt_shuffle_pack2 %rdi, 2*256+64, %rsp, 5*128 - - vmovdqa 0*256+4*32(%rsp), %ymm0 - vmovdqa 0*256+5*32(%rsp), %ymm1 - vmovdqa 0*256+6*32(%rsp), %ymm2 - vmovdqa 0*256+7*32(%rsp), %ymm3 - vmovdqa 1*256+4*32(%rsp), %ymm8 - vmovdqa 1*256+5*32(%rsp), %ymm9 - vmovdqa 1*256+6*32(%rsp), %ymm10 - vmovdqa 1*256+7*32(%rsp), %ymm11 - vmovdqa 2*256+4*32(%rsp), %ymm12 - vmovdqa 2*256+5*32(%rsp), %ymm13 - vmovdqa 2*256+6*32(%rsp), %ymm14 - vmovdqa 2*256+7*32(%rsp), %ymm15 - - movq %rsi, %rbx - leaq (%r8, %r8, 2), %rax - shlq $8, %rax - addq %rsi, %rax -scrypt_core_6way_avx2_loop1: - vmovdqa %ymm0, 0*256+4*32(%rbx) - vmovdqa %ymm1, 0*256+5*32(%rbx) - vmovdqa %ymm2, 0*256+6*32(%rbx) - vmovdqa %ymm3, 0*256+7*32(%rbx) - vpxor 0*256+0*32(%rsp), %ymm0, %ymm0 - vpxor 0*256+1*32(%rsp), %ymm1, %ymm1 - vpxor 0*256+2*32(%rsp), %ymm2, %ymm2 - vpxor 0*256+3*32(%rsp), %ymm3, %ymm3 - vmovdqa %ymm8, 1*256+4*32(%rbx) - vmovdqa %ymm9, 1*256+5*32(%rbx) - vmovdqa %ymm10, 1*256+6*32(%rbx) - vmovdqa %ymm11, 1*256+7*32(%rbx) - vpxor 1*256+0*32(%rsp), %ymm8, %ymm8 - vpxor 1*256+1*32(%rsp), %ymm9, %ymm9 - vpxor 1*256+2*32(%rsp), %ymm10, %ymm10 
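
For reference, the scrypt_core_*way loops in this file all implement the same two-phase ROMix structure: a first loop that writes the evolving 128-byte block into the scratchpad, and a second loop that XORs in scratchpad entries selected by a data-dependent index. Below is a minimal portable C sketch of that structure for a single lane with r = 1; blockmix_salsa8() is a hypothetical helper standing in for the pair of Salsa20/8 core applications performed per iteration.

#include <stdint.h>
#include <string.h>

/* Hypothetical helper: the Salsa20/8-based block mix for r = 1,
 * i.e. the two salsa8 core applications done per loop iteration. */
void blockmix_salsa8(uint32_t X[32]);

/* Single-lane sketch of the ROMix structure; X is one 128-byte block,
 * V is a scratchpad of N * 32 words, and N is a power of two. */
static void scrypt_romix_sketch(uint32_t X[32], uint32_t *V, uint32_t N)
{
    uint32_t i, j, k;

    for (i = 0; i < N; i++) {           /* corresponds to ..._loop1 */
        memcpy(&V[i * 32], X, 128);     /* V[i] = X */
        blockmix_salsa8(X);
    }
    for (i = 0; i < N; i++) {           /* corresponds to ..._loop2 */
        j = X[16] & (N - 1);            /* Integerify(X) mod N */
        for (k = 0; k < 32; k++)
            X[k] ^= V[j * 32 + k];      /* X ^= V[j] */
        blockmix_salsa8(X);
    }
}

The multi-way kernels run several such lanes at once and keep their scratchpad blocks interleaved in memory, which the 0*256/1*256/2*256 offsets and the lane-index address arithmetic reflect.
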
- vpxor 1*256+3*32(%rsp), %ymm11, %ymm11 - vmovdqa %ymm12, 2*256+4*32(%rbx) - vmovdqa %ymm13, 2*256+5*32(%rbx) - vmovdqa %ymm14, 2*256+6*32(%rbx) - vmovdqa %ymm15, 2*256+7*32(%rbx) - vpxor 2*256+0*32(%rsp), %ymm12, %ymm12 - vpxor 2*256+1*32(%rsp), %ymm13, %ymm13 - vpxor 2*256+2*32(%rsp), %ymm14, %ymm14 - vpxor 2*256+3*32(%rsp), %ymm15, %ymm15 - vmovdqa %ymm0, 0*256+0*32(%rbx) - vmovdqa %ymm1, 0*256+1*32(%rbx) - vmovdqa %ymm2, 0*256+2*32(%rbx) - vmovdqa %ymm3, 0*256+3*32(%rbx) - vmovdqa %ymm8, 1*256+0*32(%rbx) - vmovdqa %ymm9, 1*256+1*32(%rbx) - vmovdqa %ymm10, 1*256+2*32(%rbx) - vmovdqa %ymm11, 1*256+3*32(%rbx) - vmovdqa %ymm12, 2*256+0*32(%rbx) - vmovdqa %ymm13, 2*256+1*32(%rbx) - vmovdqa %ymm14, 2*256+2*32(%rbx) - vmovdqa %ymm15, 2*256+3*32(%rbx) - - salsa8_core_6way_avx2 - vpaddd 0*256+0*32(%rbx), %ymm0, %ymm0 - vpaddd 0*256+1*32(%rbx), %ymm1, %ymm1 - vpaddd 0*256+2*32(%rbx), %ymm2, %ymm2 - vpaddd 0*256+3*32(%rbx), %ymm3, %ymm3 - vpaddd 1*256+0*32(%rbx), %ymm8, %ymm8 - vpaddd 1*256+1*32(%rbx), %ymm9, %ymm9 - vpaddd 1*256+2*32(%rbx), %ymm10, %ymm10 - vpaddd 1*256+3*32(%rbx), %ymm11, %ymm11 - vpaddd 2*256+0*32(%rbx), %ymm12, %ymm12 - vpaddd 2*256+1*32(%rbx), %ymm13, %ymm13 - vpaddd 2*256+2*32(%rbx), %ymm14, %ymm14 - vpaddd 2*256+3*32(%rbx), %ymm15, %ymm15 - vmovdqa %ymm0, 0*256+0*32(%rsp) - vmovdqa %ymm1, 0*256+1*32(%rsp) - vmovdqa %ymm2, 0*256+2*32(%rsp) - vmovdqa %ymm3, 0*256+3*32(%rsp) - vmovdqa %ymm8, 1*256+0*32(%rsp) - vmovdqa %ymm9, 1*256+1*32(%rsp) - vmovdqa %ymm10, 1*256+2*32(%rsp) - vmovdqa %ymm11, 1*256+3*32(%rsp) - vmovdqa %ymm12, 2*256+0*32(%rsp) - vmovdqa %ymm13, 2*256+1*32(%rsp) - vmovdqa %ymm14, 2*256+2*32(%rsp) - vmovdqa %ymm15, 2*256+3*32(%rsp) - - vpxor 0*256+4*32(%rbx), %ymm0, %ymm0 - vpxor 0*256+5*32(%rbx), %ymm1, %ymm1 - vpxor 0*256+6*32(%rbx), %ymm2, %ymm2 - vpxor 0*256+7*32(%rbx), %ymm3, %ymm3 - vpxor 1*256+4*32(%rbx), %ymm8, %ymm8 - vpxor 1*256+5*32(%rbx), %ymm9, %ymm9 - vpxor 1*256+6*32(%rbx), %ymm10, %ymm10 - vpxor 1*256+7*32(%rbx), %ymm11, %ymm11 - vpxor 2*256+4*32(%rbx), %ymm12, %ymm12 - vpxor 2*256+5*32(%rbx), %ymm13, %ymm13 - vpxor 2*256+6*32(%rbx), %ymm14, %ymm14 - vpxor 2*256+7*32(%rbx), %ymm15, %ymm15 - vmovdqa %ymm0, 0*256+4*32(%rsp) - vmovdqa %ymm1, 0*256+5*32(%rsp) - vmovdqa %ymm2, 0*256+6*32(%rsp) - vmovdqa %ymm3, 0*256+7*32(%rsp) - vmovdqa %ymm8, 1*256+4*32(%rsp) - vmovdqa %ymm9, 1*256+5*32(%rsp) - vmovdqa %ymm10, 1*256+6*32(%rsp) - vmovdqa %ymm11, 1*256+7*32(%rsp) - vmovdqa %ymm12, 2*256+4*32(%rsp) - vmovdqa %ymm13, 2*256+5*32(%rsp) - vmovdqa %ymm14, 2*256+6*32(%rsp) - vmovdqa %ymm15, 2*256+7*32(%rsp) - salsa8_core_6way_avx2 - vpaddd 0*256+4*32(%rsp), %ymm0, %ymm0 - vpaddd 0*256+5*32(%rsp), %ymm1, %ymm1 - vpaddd 0*256+6*32(%rsp), %ymm2, %ymm2 - vpaddd 0*256+7*32(%rsp), %ymm3, %ymm3 - vpaddd 1*256+4*32(%rsp), %ymm8, %ymm8 - vpaddd 1*256+5*32(%rsp), %ymm9, %ymm9 - vpaddd 1*256+6*32(%rsp), %ymm10, %ymm10 - vpaddd 1*256+7*32(%rsp), %ymm11, %ymm11 - vpaddd 2*256+4*32(%rsp), %ymm12, %ymm12 - vpaddd 2*256+5*32(%rsp), %ymm13, %ymm13 - vpaddd 2*256+6*32(%rsp), %ymm14, %ymm14 - vpaddd 2*256+7*32(%rsp), %ymm15, %ymm15 + vpaddd %ymm0, %ymm1, %ymm4 + vpaddd %ymm8, %ymm9, %ymm6 + vpaddd %ymm12, %ymm13, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 - addq $6*128, %rbx - cmpq 
%rax, %rbx - jne scrypt_core_6way_avx2_loop1 + vpaddd %ymm3, %ymm0, %ymm4 + vpaddd %ymm11, %ymm8, %ymm6 + vpaddd %ymm15, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 - vmovdqa %ymm0, 0*256+4*32(%rsp) - vmovdqa %ymm1, 0*256+5*32(%rsp) - vmovdqa %ymm2, 0*256+6*32(%rsp) - vmovdqa %ymm3, 0*256+7*32(%rsp) - vmovdqa %ymm8, 1*256+4*32(%rsp) - vmovdqa %ymm9, 1*256+5*32(%rsp) - vmovdqa %ymm10, 1*256+6*32(%rsp) - vmovdqa %ymm11, 1*256+7*32(%rsp) - vmovdqa %ymm12, 2*256+4*32(%rsp) - vmovdqa %ymm13, 2*256+5*32(%rsp) - vmovdqa %ymm14, 2*256+6*32(%rsp) - vmovdqa %ymm15, 2*256+7*32(%rsp) + vpaddd %ymm2, %ymm3, %ymm4 + vpaddd %ymm10, %ymm11, %ymm6 + vpaddd %ymm14, %ymm15, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm3, %ymm3 + vpshufd $0x93, %ymm11, %ymm11 + vpshufd $0x93, %ymm15, %ymm15 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 - movq %r8, %rcx - leaq -1(%r8), %r11 -scrypt_core_6way_avx2_loop2: - vmovd %xmm0, %ebp - vmovd %xmm8, %ebx - vmovd %xmm12, %eax - vextracti128 $1, %ymm0, %xmm4 - vextracti128 $1, %ymm8, %xmm5 - vextracti128 $1, %ymm12, %xmm6 - vmovd %xmm4, %r8d - vmovd %xmm5, %r9d - vmovd %xmm6, %r10d - vpxor 0*256+0*32(%rsp), %ymm0, %ymm0 - vpxor 0*256+1*32(%rsp), %ymm1, %ymm1 - vpxor 0*256+2*32(%rsp), %ymm2, %ymm2 - vpxor 0*256+3*32(%rsp), %ymm3, %ymm3 - vpxor 1*256+0*32(%rsp), %ymm8, %ymm8 - vpxor 1*256+1*32(%rsp), %ymm9, %ymm9 - vpxor 1*256+2*32(%rsp), %ymm10, %ymm10 - vpxor 1*256+3*32(%rsp), %ymm11, %ymm11 - vpxor 2*256+0*32(%rsp), %ymm12, %ymm12 - vpxor 2*256+1*32(%rsp), %ymm13, %ymm13 - vpxor 2*256+2*32(%rsp), %ymm14, %ymm14 - vpxor 2*256+3*32(%rsp), %ymm15, %ymm15 - andl %r11d, %ebp - leaq 0(%rbp, %rbp, 2), %rbp - shll $8, %ebp - andl %r11d, %ebx - leaq 1(%rbx, %rbx, 2), %rbx - shll $8, %ebx - andl %r11d, %eax - leaq 2(%rax, %rax, 2), %rax - shll $8, %eax - andl %r11d, %r8d - leaq 0(%r8, %r8, 2), %r8 - shll $8, %r8d - andl %r11d, %r9d - leaq 1(%r9, %r9, 2), %r9 - shll $8, %r9d - andl %r11d, %r10d - leaq 2(%r10, %r10, 2), %r10 - shll $8, %r10d - vmovdqa 0*32(%rsi, %rbp), %xmm4 - vinserti128 $1, 0*32+16(%rsi, %r8), %ymm4, %ymm4 - vmovdqa 1*32(%rsi, %rbp), %xmm5 - vinserti128 $1, 1*32+16(%rsi, %r8), %ymm5, %ymm5 - vmovdqa 2*32(%rsi, %rbp), %xmm6 - vinserti128 $1, 2*32+16(%rsi, %r8), %ymm6, %ymm6 - vmovdqa 3*32(%rsi, %rbp), %xmm7 - vinserti128 $1, 3*32+16(%rsi, %r8), %ymm7, %ymm7 + vpaddd %ymm1, %ymm2, %ymm4 + vpaddd %ymm9, %ymm10, %ymm6 + vpaddd %ymm13, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpshufd $0x4e, %ymm14, %ymm14 + vpxor %ymm5, %ymm0, %ymm0 vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + + vpaddd %ymm0, %ymm3, %ymm4 + vpaddd %ymm8, %ymm11, %ymm6 + vpaddd %ymm12, %ymm15, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, 
%ymm4 + vpshufd $0x39, %ymm1, %ymm1 vpxor %ymm5, %ymm1, %ymm1 - vpxor %ymm6, %ymm2, %ymm2 - vpxor %ymm7, %ymm3, %ymm3 - vmovdqa 0*32(%rsi, %rbx), %xmm4 - vinserti128 $1, 0*32+16(%rsi, %r9), %ymm4, %ymm4 - vmovdqa 1*32(%rsi, %rbx), %xmm5 - vinserti128 $1, 1*32+16(%rsi, %r9), %ymm5, %ymm5 - vmovdqa 2*32(%rsi, %rbx), %xmm6 - vinserti128 $1, 2*32+16(%rsi, %r9), %ymm6, %ymm6 - vmovdqa 3*32(%rsi, %rbx), %xmm7 - vinserti128 $1, 3*32+16(%rsi, %r9), %ymm7, %ymm7 - vpxor %ymm4, %ymm8, %ymm8 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpshufd $0x39, %ymm9, %ymm9 vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpshufd $0x39, %ymm13, %ymm13 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm0, %ymm4 + vpaddd %ymm9, %ymm8, %ymm6 + vpaddd %ymm13, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 vpxor %ymm6, %ymm10, %ymm10 - vpxor %ymm7, %ymm11, %ymm11 - vmovdqa 0*32(%rsi, %rax), %xmm4 - vinserti128 $1, 0*32+16(%rsi, %r10), %ymm4, %ymm4 - vmovdqa 1*32(%rsi, %rax), %xmm5 - vinserti128 $1, 1*32+16(%rsi, %r10), %ymm5, %ymm5 - vmovdqa 2*32(%rsi, %rax), %xmm6 - vinserti128 $1, 2*32+16(%rsi, %r10), %ymm6, %ymm6 - vmovdqa 3*32(%rsi, %rax), %xmm7 - vinserti128 $1, 3*32+16(%rsi, %r10), %ymm7, %ymm7 - vpxor %ymm4, %ymm12, %ymm12 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm1, %ymm4 + vpaddd %ymm10, %ymm9, %ymm6 + vpaddd %ymm14, %ymm13, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm1, %ymm1 + vpshufd $0x93, %ymm9, %ymm9 + vpshufd $0x93, %ymm13, %ymm13 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm2, %ymm4 + vpaddd %ymm11, %ymm10, %ymm6 + vpaddd %ymm15, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpshufd $0x4e, %ymm14, %ymm14 + vpshufd $0x39, %ymm11, %ymm11 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpshufd $0x39, %ymm3, %ymm3 + vpshufd $0x39, %ymm15, %ymm15 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + vpaddd %ymm0, %ymm1, %ymm4 + vpaddd %ymm8, %ymm9, %ymm6 + vpaddd %ymm12, %ymm13, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm0, %ymm4 + vpaddd %ymm11, %ymm8, %ymm6 + vpaddd %ymm15, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + 
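
The recurring vpslld/vpsrld pair followed by two vpxor instructions is a rotate-then-XOR: AVX2 has no packed 32-bit rotate, so a left shift and its complementary right shift (7+25, 9+23, 13+19, 18+14) are XORed in separately. In scalar C the same step of the Salsa20 quarter-round looks like the sketch below (the register-to-column mapping in the vectorized code differs, but the structure is identical):

#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, unsigned c)
{
    return (v << c) | (v >> (32 - c));
}

/* One Salsa20 quarter-round: each line matches one add/rotate/xor group
 * (vpaddd, vpslld+vpsrld, vpxor+vpxor) in the vector code above. */
static inline void salsa20_quarterround(uint32_t *a, uint32_t *b,
                                        uint32_t *c, uint32_t *d)
{
    *b ^= rotl32(*a + *d, 7);
    *c ^= rotl32(*b + *a, 9);
    *d ^= rotl32(*c + *b, 13);
    *a ^= rotl32(*d + *c, 18);
}
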
vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm3, %ymm4 + vpaddd %ymm10, %ymm11, %ymm6 + vpaddd %ymm14, %ymm15, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm3, %ymm3 + vpshufd $0x93, %ymm11, %ymm11 + vpshufd $0x93, %ymm15, %ymm15 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 vpxor %ymm5, %ymm13, %ymm13 - vpxor %ymm6, %ymm14, %ymm14 - vpxor %ymm7, %ymm15, %ymm15 - - vmovdqa %ymm0, 0*256+0*32(%rsp) - vmovdqa %ymm1, 0*256+1*32(%rsp) - vmovdqa %ymm2, 0*256+2*32(%rsp) - vmovdqa %ymm3, 0*256+3*32(%rsp) - vmovdqa %ymm8, 1*256+0*32(%rsp) - vmovdqa %ymm9, 1*256+1*32(%rsp) - vmovdqa %ymm10, 1*256+2*32(%rsp) - vmovdqa %ymm11, 1*256+3*32(%rsp) - vmovdqa %ymm12, 2*256+0*32(%rsp) - vmovdqa %ymm13, 2*256+1*32(%rsp) - vmovdqa %ymm14, 2*256+2*32(%rsp) - vmovdqa %ymm15, 2*256+3*32(%rsp) - salsa8_core_6way_avx2 - vpaddd 0*256+0*32(%rsp), %ymm0, %ymm0 - vpaddd 0*256+1*32(%rsp), %ymm1, %ymm1 - vpaddd 0*256+2*32(%rsp), %ymm2, %ymm2 - vpaddd 0*256+3*32(%rsp), %ymm3, %ymm3 - vpaddd 1*256+0*32(%rsp), %ymm8, %ymm8 - vpaddd 1*256+1*32(%rsp), %ymm9, %ymm9 - vpaddd 1*256+2*32(%rsp), %ymm10, %ymm10 - vpaddd 1*256+3*32(%rsp), %ymm11, %ymm11 - vpaddd 2*256+0*32(%rsp), %ymm12, %ymm12 - vpaddd 2*256+1*32(%rsp), %ymm13, %ymm13 - vpaddd 2*256+2*32(%rsp), %ymm14, %ymm14 - vpaddd 2*256+3*32(%rsp), %ymm15, %ymm15 - vmovdqa %ymm0, 0*256+0*32(%rsp) - vmovdqa %ymm1, 0*256+1*32(%rsp) - vmovdqa %ymm2, 0*256+2*32(%rsp) - vmovdqa %ymm3, 0*256+3*32(%rsp) - vmovdqa %ymm8, 1*256+0*32(%rsp) - vmovdqa %ymm9, 1*256+1*32(%rsp) - vmovdqa %ymm10, 1*256+2*32(%rsp) - vmovdqa %ymm11, 1*256+3*32(%rsp) - vmovdqa %ymm12, 2*256+0*32(%rsp) - vmovdqa %ymm13, 2*256+1*32(%rsp) - vmovdqa %ymm14, 2*256+2*32(%rsp) - vmovdqa %ymm15, 2*256+3*32(%rsp) + vpxor %ymm7, %ymm13, %ymm13 - vmovdqa 4*32(%rsi, %rbp), %xmm4 - vinserti128 $1, 4*32+16(%rsi, %r8), %ymm4, %ymm4 - vmovdqa 5*32(%rsi, %rbp), %xmm5 - vinserti128 $1, 5*32+16(%rsi, %r8), %ymm5, %ymm5 - vmovdqa 6*32(%rsi, %rbp), %xmm6 - vinserti128 $1, 6*32+16(%rsi, %r8), %ymm6, %ymm6 - vmovdqa 7*32(%rsi, %rbp), %xmm7 - vinserti128 $1, 7*32+16(%rsi, %r8), %ymm7, %ymm7 + vpaddd %ymm1, %ymm2, %ymm4 + vpaddd %ymm9, %ymm10, %ymm6 + vpaddd %ymm13, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpshufd $0x4e, %ymm14, %ymm14 + vpxor %ymm5, %ymm0, %ymm0 vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + + vpaddd %ymm0, %ymm3, %ymm4 + vpaddd %ymm8, %ymm11, %ymm6 + vpaddd %ymm12, %ymm15, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpshufd $0x39, %ymm1, %ymm1 vpxor %ymm5, %ymm1, %ymm1 - vpxor %ymm6, %ymm2, %ymm2 - vpxor %ymm7, %ymm3, %ymm3 - vmovdqa 4*32(%rsi, %rbx), %xmm4 - vinserti128 $1, 4*32+16(%rsi, %r9), %ymm4, %ymm4 - vmovdqa 5*32(%rsi, %rbx), %xmm5 - vinserti128 $1, 5*32+16(%rsi, %r9), %ymm5, %ymm5 - vmovdqa 6*32(%rsi, %rbx), %xmm6 - vinserti128 $1, 6*32+16(%rsi, %r9), %ymm6, %ymm6 - vmovdqa 7*32(%rsi, %rbx), %xmm7 - vinserti128 $1, 7*32+16(%rsi, %r9), %ymm7, %ymm7 - vpxor %ymm4, %ymm8, %ymm8 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpshufd 
$0x39, %ymm9, %ymm9 vpxor %ymm5, %ymm9, %ymm9 - vpxor %ymm6, %ymm10, %ymm10 - vpxor %ymm7, %ymm11, %ymm11 - vmovdqa 4*32(%rsi, %rax), %xmm4 - vinserti128 $1, 4*32+16(%rsi, %r10), %ymm4, %ymm4 - vmovdqa 5*32(%rsi, %rax), %xmm5 - vinserti128 $1, 5*32+16(%rsi, %r10), %ymm5, %ymm5 - vmovdqa 6*32(%rsi, %rax), %xmm6 - vinserti128 $1, 6*32+16(%rsi, %r10), %ymm6, %ymm6 - vmovdqa 7*32(%rsi, %rax), %xmm7 - vinserti128 $1, 7*32+16(%rsi, %r10), %ymm7, %ymm7 - vpxor %ymm4, %ymm12, %ymm12 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpshufd $0x39, %ymm13, %ymm13 vpxor %ymm5, %ymm13, %ymm13 - vpxor %ymm6, %ymm14, %ymm14 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm0, %ymm4 + vpaddd %ymm9, %ymm8, %ymm6 + vpaddd %ymm13, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm1, %ymm4 + vpaddd %ymm10, %ymm9, %ymm6 + vpaddd %ymm14, %ymm13, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm1, %ymm1 + vpshufd $0x93, %ymm9, %ymm9 + vpshufd $0x93, %ymm13, %ymm13 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 vpxor %ymm7, %ymm15, %ymm15 - vpxor 0*256+4*32(%rsp), %ymm0, %ymm0 - vpxor 0*256+5*32(%rsp), %ymm1, %ymm1 - vpxor 0*256+6*32(%rsp), %ymm2, %ymm2 - vpxor 0*256+7*32(%rsp), %ymm3, %ymm3 - vpxor 1*256+4*32(%rsp), %ymm8, %ymm8 - vpxor 1*256+5*32(%rsp), %ymm9, %ymm9 - vpxor 1*256+6*32(%rsp), %ymm10, %ymm10 - vpxor 1*256+7*32(%rsp), %ymm11, %ymm11 - vpxor 2*256+4*32(%rsp), %ymm12, %ymm12 - vpxor 2*256+5*32(%rsp), %ymm13, %ymm13 - vpxor 2*256+6*32(%rsp), %ymm14, %ymm14 - vpxor 2*256+7*32(%rsp), %ymm15, %ymm15 - vmovdqa %ymm0, 0*256+4*32(%rsp) - vmovdqa %ymm1, 0*256+5*32(%rsp) - vmovdqa %ymm2, 0*256+6*32(%rsp) - vmovdqa %ymm3, 0*256+7*32(%rsp) - vmovdqa %ymm8, 1*256+4*32(%rsp) - vmovdqa %ymm9, 1*256+5*32(%rsp) - vmovdqa %ymm10, 1*256+6*32(%rsp) - vmovdqa %ymm11, 1*256+7*32(%rsp) - vmovdqa %ymm12, 2*256+4*32(%rsp) - vmovdqa %ymm13, 2*256+5*32(%rsp) - vmovdqa %ymm14, 2*256+6*32(%rsp) - vmovdqa %ymm15, 2*256+7*32(%rsp) - salsa8_core_6way_avx2 + + vpaddd %ymm3, %ymm2, %ymm4 + vpaddd %ymm11, %ymm10, %ymm6 + vpaddd %ymm15, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpshufd $0x4e, %ymm14, %ymm14 + vpshufd $0x39, %ymm11, %ymm11 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpshufd $0x39, %ymm3, %ymm3 + vpshufd $0x39, %ymm15, %ymm15 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 vpaddd 0*256+4*32(%rsp), %ymm0, %ymm0 vpaddd 0*256+5*32(%rsp), %ymm1, %ymm1 vpaddd 0*256+6*32(%rsp), %ymm2, %ymm2 @@ -2892,14 +14107,145 @@ scrypt_core_6way_avx2_loop2: subq $1, %rcx ja scrypt_core_6way_avx2_loop2 - scrypt_shuffle_unpack2 %rsp, 0*128, %rdi, 0*256+0 - scrypt_shuffle_unpack2 %rsp, 1*128, %rdi, 0*256+64 - scrypt_shuffle_unpack2 %rsp, 2*128, %rdi, 1*256+0 - 
scrypt_shuffle_unpack2 %rsp, 3*128, %rdi, 1*256+64 - scrypt_shuffle_unpack2 %rsp, 4*128, %rdi, 2*256+0 - scrypt_shuffle_unpack2 %rsp, 5*128, %rdi, 2*256+64 + vmovdqa 0*128+0*32(%rsp), %ymm0 + vmovdqa 0*128+1*32(%rsp), %ymm1 + vmovdqa 0*128+2*32(%rsp), %ymm2 + vmovdqa 0*128+3*32(%rsp), %ymm3 + vpblendd $0x33, %ymm0, %ymm2, %ymm4 + vpblendd $0xcc, %ymm1, %ymm3, %ymm5 + vpblendd $0x33, %ymm2, %ymm0, %ymm6 + vpblendd $0xcc, %ymm3, %ymm1, %ymm7 + vpblendd $0x55, %ymm7, %ymm6, %ymm3 + vpblendd $0x55, %ymm6, %ymm5, %ymm2 + vpblendd $0x55, %ymm5, %ymm4, %ymm1 + vpblendd $0x55, %ymm4, %ymm7, %ymm0 + vmovdqa %xmm0, 0*256+0+0*16(%rdi) + vmovdqa %xmm1, 0*256+0+1*16(%rdi) + vmovdqa %xmm2, 0*256+0+2*16(%rdi) + vmovdqa %xmm3, 0*256+0+3*16(%rdi) + vextracti128 $1, %ymm0, 0*256+0+128+0*16(%rdi) + vextracti128 $1, %ymm1, 0*256+0+128+1*16(%rdi) + vextracti128 $1, %ymm2, 0*256+0+128+2*16(%rdi) + vextracti128 $1, %ymm3, 0*256+0+128+3*16(%rdi) + vmovdqa 1*128+0*32(%rsp), %ymm0 + vmovdqa 1*128+1*32(%rsp), %ymm1 + vmovdqa 1*128+2*32(%rsp), %ymm2 + vmovdqa 1*128+3*32(%rsp), %ymm3 + vpblendd $0x33, %ymm0, %ymm2, %ymm4 + vpblendd $0xcc, %ymm1, %ymm3, %ymm5 + vpblendd $0x33, %ymm2, %ymm0, %ymm6 + vpblendd $0xcc, %ymm3, %ymm1, %ymm7 + vpblendd $0x55, %ymm7, %ymm6, %ymm3 + vpblendd $0x55, %ymm6, %ymm5, %ymm2 + vpblendd $0x55, %ymm5, %ymm4, %ymm1 + vpblendd $0x55, %ymm4, %ymm7, %ymm0 + vmovdqa %xmm0, 0*256+64+0*16(%rdi) + vmovdqa %xmm1, 0*256+64+1*16(%rdi) + vmovdqa %xmm2, 0*256+64+2*16(%rdi) + vmovdqa %xmm3, 0*256+64+3*16(%rdi) + vextracti128 $1, %ymm0, 0*256+64+128+0*16(%rdi) + vextracti128 $1, %ymm1, 0*256+64+128+1*16(%rdi) + vextracti128 $1, %ymm2, 0*256+64+128+2*16(%rdi) + vextracti128 $1, %ymm3, 0*256+64+128+3*16(%rdi) + vmovdqa 2*128+0*32(%rsp), %ymm0 + vmovdqa 2*128+1*32(%rsp), %ymm1 + vmovdqa 2*128+2*32(%rsp), %ymm2 + vmovdqa 2*128+3*32(%rsp), %ymm3 + vpblendd $0x33, %ymm0, %ymm2, %ymm4 + vpblendd $0xcc, %ymm1, %ymm3, %ymm5 + vpblendd $0x33, %ymm2, %ymm0, %ymm6 + vpblendd $0xcc, %ymm3, %ymm1, %ymm7 + vpblendd $0x55, %ymm7, %ymm6, %ymm3 + vpblendd $0x55, %ymm6, %ymm5, %ymm2 + vpblendd $0x55, %ymm5, %ymm4, %ymm1 + vpblendd $0x55, %ymm4, %ymm7, %ymm0 + vmovdqa %xmm0, 1*256+0+0*16(%rdi) + vmovdqa %xmm1, 1*256+0+1*16(%rdi) + vmovdqa %xmm2, 1*256+0+2*16(%rdi) + vmovdqa %xmm3, 1*256+0+3*16(%rdi) + vextracti128 $1, %ymm0, 1*256+0+128+0*16(%rdi) + vextracti128 $1, %ymm1, 1*256+0+128+1*16(%rdi) + vextracti128 $1, %ymm2, 1*256+0+128+2*16(%rdi) + vextracti128 $1, %ymm3, 1*256+0+128+3*16(%rdi) + vmovdqa 3*128+0*32(%rsp), %ymm0 + vmovdqa 3*128+1*32(%rsp), %ymm1 + vmovdqa 3*128+2*32(%rsp), %ymm2 + vmovdqa 3*128+3*32(%rsp), %ymm3 + vpblendd $0x33, %ymm0, %ymm2, %ymm4 + vpblendd $0xcc, %ymm1, %ymm3, %ymm5 + vpblendd $0x33, %ymm2, %ymm0, %ymm6 + vpblendd $0xcc, %ymm3, %ymm1, %ymm7 + vpblendd $0x55, %ymm7, %ymm6, %ymm3 + vpblendd $0x55, %ymm6, %ymm5, %ymm2 + vpblendd $0x55, %ymm5, %ymm4, %ymm1 + vpblendd $0x55, %ymm4, %ymm7, %ymm0 + vmovdqa %xmm0, 1*256+64+0*16(%rdi) + vmovdqa %xmm1, 1*256+64+1*16(%rdi) + vmovdqa %xmm2, 1*256+64+2*16(%rdi) + vmovdqa %xmm3, 1*256+64+3*16(%rdi) + vextracti128 $1, %ymm0, 1*256+64+128+0*16(%rdi) + vextracti128 $1, %ymm1, 1*256+64+128+1*16(%rdi) + vextracti128 $1, %ymm2, 1*256+64+128+2*16(%rdi) + vextracti128 $1, %ymm3, 1*256+64+128+3*16(%rdi) + vmovdqa 4*128+0*32(%rsp), %ymm0 + vmovdqa 4*128+1*32(%rsp), %ymm1 + vmovdqa 4*128+2*32(%rsp), %ymm2 + vmovdqa 4*128+3*32(%rsp), %ymm3 + vpblendd $0x33, %ymm0, %ymm2, %ymm4 + vpblendd $0xcc, %ymm1, %ymm3, %ymm5 + vpblendd $0x33, %ymm2, %ymm0, %ymm6 + vpblendd 
$0xcc, %ymm3, %ymm1, %ymm7 + vpblendd $0x55, %ymm7, %ymm6, %ymm3 + vpblendd $0x55, %ymm6, %ymm5, %ymm2 + vpblendd $0x55, %ymm5, %ymm4, %ymm1 + vpblendd $0x55, %ymm4, %ymm7, %ymm0 + vmovdqa %xmm0, 2*256+0+0*16(%rdi) + vmovdqa %xmm1, 2*256+0+1*16(%rdi) + vmovdqa %xmm2, 2*256+0+2*16(%rdi) + vmovdqa %xmm3, 2*256+0+3*16(%rdi) + vextracti128 $1, %ymm0, 2*256+0+128+0*16(%rdi) + vextracti128 $1, %ymm1, 2*256+0+128+1*16(%rdi) + vextracti128 $1, %ymm2, 2*256+0+128+2*16(%rdi) + vextracti128 $1, %ymm3, 2*256+0+128+3*16(%rdi) + vmovdqa 5*128+0*32(%rsp), %ymm0 + vmovdqa 5*128+1*32(%rsp), %ymm1 + vmovdqa 5*128+2*32(%rsp), %ymm2 + vmovdqa 5*128+3*32(%rsp), %ymm3 + vpblendd $0x33, %ymm0, %ymm2, %ymm4 + vpblendd $0xcc, %ymm1, %ymm3, %ymm5 + vpblendd $0x33, %ymm2, %ymm0, %ymm6 + vpblendd $0xcc, %ymm3, %ymm1, %ymm7 + vpblendd $0x55, %ymm7, %ymm6, %ymm3 + vpblendd $0x55, %ymm6, %ymm5, %ymm2 + vpblendd $0x55, %ymm5, %ymm4, %ymm1 + vpblendd $0x55, %ymm4, %ymm7, %ymm0 + vmovdqa %xmm0, 2*256+64+0*16(%rdi) + vmovdqa %xmm1, 2*256+64+1*16(%rdi) + vmovdqa %xmm2, 2*256+64+2*16(%rdi) + vmovdqa %xmm3, 2*256+64+3*16(%rdi) + vextracti128 $1, %ymm0, 2*256+64+128+0*16(%rdi) + vextracti128 $1, %ymm1, 2*256+64+128+1*16(%rdi) + vextracti128 $1, %ymm2, 2*256+64+128+2*16(%rdi) + vextracti128 $1, %ymm3, 2*256+64+128+3*16(%rdi) - scrypt_core_6way_cleanup + movq %rdx, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + popq %rdi + vmovdqa 8(%rsp), %xmm6 + vmovdqa 24(%rsp), %xmm7 + vmovdqa 40(%rsp), %xmm8 + vmovdqa 56(%rsp), %xmm9 + vmovdqa 72(%rsp), %xmm10 + vmovdqa 88(%rsp), %xmm11 + vmovdqa 104(%rsp), %xmm12 + vmovdqa 120(%rsp), %xmm13 + vmovdqa 136(%rsp), %xmm14 + vmovdqa 152(%rsp), %xmm15 + addq $176, %rsp +#endif + popq %rbp + popq %rbx ret #endif /* USE_AVX2 */ diff --git a/scrypt-x64.S.orig b/scrypt-x64.S.orig new file mode 100644 index 000000000..f9185d490 --- /dev/null +++ b/scrypt-x64.S.orig @@ -0,0 +1,2907 @@ +/* + * Copyright 2011-2014 pooler@litecoinpool.org + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "cpuminer-config.h" + +#if defined(__linux__) && defined(__ELF__) + .section .note.GNU-stack,"",%progbits +#endif + +#if defined(USE_ASM) && defined(__x86_64__) + + .text + .p2align 6 + .globl scrypt_best_throughput + .globl _scrypt_best_throughput +scrypt_best_throughput: +_scrypt_best_throughput: + pushq %rbx +#if defined(USE_AVX2) + /* Check for AVX and OSXSAVE support */ + movl $1, %eax + cpuid + andl $0x18000000, %ecx + cmpl $0x18000000, %ecx + jne scrypt_best_throughput_no_avx2 + /* Check for AVX2 support */ + movl $7, %eax + xorl %ecx, %ecx + cpuid + andl $0x00000020, %ebx + cmpl $0x00000020, %ebx + jne scrypt_best_throughput_no_avx2 + /* Check for XMM and YMM state support */ + xorl %ecx, %ecx + xgetbv + andl $0x00000006, %eax + cmpl $0x00000006, %eax + jne scrypt_best_throughput_no_avx2 + movl $6, %eax + jmp scrypt_best_throughput_exit +scrypt_best_throughput_no_avx2: +#endif + /* Check for AuthenticAMD */ + xorq %rax, %rax + cpuid + movl $3, %eax + cmpl $0x444d4163, %ecx + jne scrypt_best_throughput_not_amd + cmpl $0x69746e65, %edx + jne scrypt_best_throughput_not_amd + cmpl $0x68747541, %ebx + jne scrypt_best_throughput_not_amd + /* Check for AMD K8 or Bobcat */ + movl $1, %eax + cpuid + andl $0x0ff00000, %eax + jz scrypt_best_throughput_one + cmpl $0x00500000, %eax + je scrypt_best_throughput_one + movl $3, %eax + jmp scrypt_best_throughput_exit +scrypt_best_throughput_not_amd: + /* Check for GenuineIntel */ + cmpl $0x6c65746e, %ecx + jne scrypt_best_throughput_exit + cmpl $0x49656e69, %edx + jne scrypt_best_throughput_exit + cmpl $0x756e6547, %ebx + jne scrypt_best_throughput_exit + /* Check for Intel Atom */ + movl $1, %eax + cpuid + movl %eax, %edx + andl $0x0ff00f00, %eax + cmpl $0x00000600, %eax + movl $3, %eax + jnz scrypt_best_throughput_exit + andl $0x000f00f0, %edx + cmpl $0x000100c0, %edx + je scrypt_best_throughput_one + cmpl $0x00020060, %edx + je scrypt_best_throughput_one + cmpl $0x00030060, %edx + jne scrypt_best_throughput_exit +scrypt_best_throughput_one: + movl $1, %eax +scrypt_best_throughput_exit: + popq %rbx + ret + + +.macro scrypt_shuffle src, so, dest, do + movl \so+60(\src), %eax + movl \so+44(\src), %ebx + movl \so+28(\src), %ecx + movl \so+12(\src), %edx + movl %eax, \do+12(\dest) + movl %ebx, \do+28(\dest) + movl %ecx, \do+44(\dest) + movl %edx, \do+60(\dest) + movl \so+40(\src), %eax + movl \so+8(\src), %ebx + movl \so+48(\src), %ecx + movl \so+16(\src), %edx + movl %eax, \do+8(\dest) + movl %ebx, \do+40(\dest) + movl %ecx, \do+16(\dest) + movl %edx, \do+48(\dest) + movl \so+20(\src), %eax + movl \so+4(\src), %ebx + movl \so+52(\src), %ecx + movl \so+36(\src), %edx + movl %eax, \do+4(\dest) + movl %ebx, \do+20(\dest) + movl %ecx, \do+36(\dest) + movl %edx, \do+52(\dest) + movl \so+0(\src), %eax + movl \so+24(\src), %ebx + movl \so+32(\src), %ecx + movl \so+56(\src), %edx + movl %eax, \do+0(\dest) + movl %ebx, \do+24(\dest) + movl %ecx, \do+32(\dest) + movl %edx, \do+56(\dest) +.endm + + +.macro salsa8_core_gen_doubleround + movq 72(%rsp), %r15 + + leaq (%r14, %rdx), %rbp + roll $7, %ebp + xorl %ebp, %r9d + leaq (%rdi, %r15), %rbp + roll $7, %ebp + xorl %ebp, %r10d + leaq (%rdx, %r9), %rbp + roll $9, %ebp + xorl %ebp, %r11d + leaq (%r15, %r10), %rbp + roll $9, %ebp + xorl %ebp, %r13d + + leaq (%r9, %r11), %rbp + roll $13, %ebp + xorl %ebp, %r14d + leaq (%r10, %r13), %rbp + roll $13, %ebp + xorl %ebp, %edi + leaq (%r11, %r14), %rbp + roll $18, %ebp + xorl %ebp, %edx + leaq (%r13, %rdi), %rbp + roll $18, %ebp + xorl %ebp, %r15d + + 
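
scrypt_best_throughput above selects the kernel width with raw cpuid/xgetbv checks: AVX2 together with OS-enabled YMM state selects the 6-way kernel, while vendor/family checks demote older AMD parts (K8, Bobcat) and Intel Atom to 1-way, and everything else defaults to 3-way. A rough C equivalent of the main decision, using compiler builtins rather than inline cpuid and omitting the per-microarchitecture special cases, might look like:

/* Sketch only: relies on GCC/Clang __builtin_cpu_supports(); the
 * Atom/K8/Bobcat downgrades from the assembly are not reproduced. */
static int scrypt_best_throughput_sketch(void)
{
#if defined(__GNUC__) || defined(__clang__)
    __builtin_cpu_init();
    if (__builtin_cpu_supports("avx2"))
        return 6;   /* 6-way AVX2 kernel */
#endif
    return 3;       /* default: 3-way SSE2/AVX kernel */
}
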
movq 48(%rsp), %rbp + movq %r15, 72(%rsp) + + leaq (%rax, %rbp), %r15 + roll $7, %r15d + xorl %r15d, %ebx + leaq (%rbp, %rbx), %r15 + roll $9, %r15d + xorl %r15d, %ecx + leaq (%rbx, %rcx), %r15 + roll $13, %r15d + xorl %r15d, %eax + leaq (%rcx, %rax), %r15 + roll $18, %r15d + xorl %r15d, %ebp + + movq 88(%rsp), %r15 + movq %rbp, 48(%rsp) + + leaq (%r12, %r15), %rbp + roll $7, %ebp + xorl %ebp, %esi + leaq (%r15, %rsi), %rbp + roll $9, %ebp + xorl %ebp, %r8d + leaq (%rsi, %r8), %rbp + roll $13, %ebp + xorl %ebp, %r12d + leaq (%r8, %r12), %rbp + roll $18, %ebp + xorl %ebp, %r15d + + movq %r15, 88(%rsp) + movq 72(%rsp), %r15 + + leaq (%rsi, %rdx), %rbp + roll $7, %ebp + xorl %ebp, %edi + leaq (%r9, %r15), %rbp + roll $7, %ebp + xorl %ebp, %eax + leaq (%rdx, %rdi), %rbp + roll $9, %ebp + xorl %ebp, %ecx + leaq (%r15, %rax), %rbp + roll $9, %ebp + xorl %ebp, %r8d + + leaq (%rdi, %rcx), %rbp + roll $13, %ebp + xorl %ebp, %esi + leaq (%rax, %r8), %rbp + roll $13, %ebp + xorl %ebp, %r9d + leaq (%rcx, %rsi), %rbp + roll $18, %ebp + xorl %ebp, %edx + leaq (%r8, %r9), %rbp + roll $18, %ebp + xorl %ebp, %r15d + + movq 48(%rsp), %rbp + movq %r15, 72(%rsp) + + leaq (%r10, %rbp), %r15 + roll $7, %r15d + xorl %r15d, %r12d + leaq (%rbp, %r12), %r15 + roll $9, %r15d + xorl %r15d, %r11d + leaq (%r12, %r11), %r15 + roll $13, %r15d + xorl %r15d, %r10d + leaq (%r11, %r10), %r15 + roll $18, %r15d + xorl %r15d, %ebp + + movq 88(%rsp), %r15 + movq %rbp, 48(%rsp) + + leaq (%rbx, %r15), %rbp + roll $7, %ebp + xorl %ebp, %r14d + leaq (%r15, %r14), %rbp + roll $9, %ebp + xorl %ebp, %r13d + leaq (%r14, %r13), %rbp + roll $13, %ebp + xorl %ebp, %ebx + leaq (%r13, %rbx), %rbp + roll $18, %ebp + xorl %ebp, %r15d + + movq %r15, 88(%rsp) +.endm + + .text + .p2align 6 +salsa8_core_gen: + /* 0: %rdx, %rdi, %rcx, %rsi */ + movq 8(%rsp), %rdi + movq %rdi, %rdx + shrq $32, %rdi + movq 16(%rsp), %rsi + movq %rsi, %rcx + shrq $32, %rsi + /* 1: %r9, 72(%rsp), %rax, %r8 */ + movq 24(%rsp), %r8 + movq %r8, %r9 + shrq $32, %r8 + movq %r8, 72(%rsp) + movq 32(%rsp), %r8 + movq %r8, %rax + shrq $32, %r8 + /* 2: %r11, %r10, 48(%rsp), %r12 */ + movq 40(%rsp), %r10 + movq %r10, %r11 + shrq $32, %r10 + movq 48(%rsp), %r12 + /* movq %r12, %r13 */ + /* movq %r13, 48(%rsp) */ + shrq $32, %r12 + /* 3: %r14, %r13, %rbx, 88(%rsp) */ + movq 56(%rsp), %r13 + movq %r13, %r14 + shrq $32, %r13 + movq 64(%rsp), %r15 + movq %r15, %rbx + shrq $32, %r15 + movq %r15, 88(%rsp) + + salsa8_core_gen_doubleround + salsa8_core_gen_doubleround + salsa8_core_gen_doubleround + salsa8_core_gen_doubleround + + shlq $32, %rdi + xorq %rdi, %rdx + movq %rdx, 24(%rsp) + + shlq $32, %rsi + xorq %rsi, %rcx + movq %rcx, 32(%rsp) + + movl 72(%rsp), %edi + shlq $32, %rdi + xorq %rdi, %r9 + movq %r9, 40(%rsp) + + movl 48(%rsp), %ebp + shlq $32, %r8 + xorq %r8, %rax + movq %rax, 48(%rsp) + + shlq $32, %r10 + xorq %r10, %r11 + movq %r11, 56(%rsp) + + shlq $32, %r12 + xorq %r12, %rbp + movq %rbp, 64(%rsp) + + shlq $32, %r13 + xorq %r13, %r14 + movq %r14, 72(%rsp) + + movdqa 24(%rsp), %xmm0 + + shlq $32, %r15 + xorq %r15, %rbx + movq %rbx, 80(%rsp) + + movdqa 40(%rsp), %xmm1 + movdqa 56(%rsp), %xmm2 + movdqa 72(%rsp), %xmm3 + + ret + + + .text + .p2align 6 + .globl scrypt_core + .globl _scrypt_core +scrypt_core: +_scrypt_core: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 +#if defined(_WIN64) || defined(__CYGWIN__) + subq $176, %rsp + movdqa %xmm6, 8(%rsp) + movdqa %xmm7, 24(%rsp) + movdqa %xmm8, 40(%rsp) + movdqa %xmm9, 56(%rsp) + movdqa %xmm10, 
72(%rsp) + movdqa %xmm11, 88(%rsp) + movdqa %xmm12, 104(%rsp) + movdqa %xmm13, 120(%rsp) + movdqa %xmm14, 136(%rsp) + movdqa %xmm15, 152(%rsp) + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#else + movq %rdx, %r8 +#endif + +.macro scrypt_core_cleanup +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + popq %rdi + movdqa 8(%rsp), %xmm6 + movdqa 24(%rsp), %xmm7 + movdqa 40(%rsp), %xmm8 + movdqa 56(%rsp), %xmm9 + movdqa 72(%rsp), %xmm10 + movdqa 88(%rsp), %xmm11 + movdqa 104(%rsp), %xmm12 + movdqa 120(%rsp), %xmm13 + movdqa 136(%rsp), %xmm14 + movdqa 152(%rsp), %xmm15 + addq $176, %rsp +#endif + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx +.endm + + /* GenuineIntel processors have fast SIMD */ + xorl %eax, %eax + cpuid + cmpl $0x6c65746e, %ecx + jne scrypt_core_gen + cmpl $0x49656e69, %edx + jne scrypt_core_gen + cmpl $0x756e6547, %ebx + je scrypt_core_xmm + + .p2align 6 +scrypt_core_gen: + subq $136, %rsp + movdqa 0(%rdi), %xmm8 + movdqa 16(%rdi), %xmm9 + movdqa 32(%rdi), %xmm10 + movdqa 48(%rdi), %xmm11 + movdqa 64(%rdi), %xmm12 + movdqa 80(%rdi), %xmm13 + movdqa 96(%rdi), %xmm14 + movdqa 112(%rdi), %xmm15 + + movq %r8, %rcx + shlq $7, %rcx + addq %rsi, %rcx + movq %r8, 96(%rsp) + movq %rdi, 104(%rsp) + movq %rsi, 112(%rsp) + movq %rcx, 120(%rsp) +scrypt_core_gen_loop1: + movdqa %xmm8, 0(%rsi) + movdqa %xmm9, 16(%rsi) + movdqa %xmm10, 32(%rsi) + movdqa %xmm11, 48(%rsi) + movdqa %xmm12, 64(%rsi) + movdqa %xmm13, 80(%rsi) + movdqa %xmm14, 96(%rsi) + movdqa %xmm15, 112(%rsi) + + pxor %xmm12, %xmm8 + pxor %xmm13, %xmm9 + pxor %xmm14, %xmm10 + pxor %xmm15, %xmm11 + movdqa %xmm8, 0(%rsp) + movdqa %xmm9, 16(%rsp) + movdqa %xmm10, 32(%rsp) + movdqa %xmm11, 48(%rsp) + movq %rsi, 128(%rsp) + call salsa8_core_gen + paddd %xmm0, %xmm8 + paddd %xmm1, %xmm9 + paddd %xmm2, %xmm10 + paddd %xmm3, %xmm11 + + pxor %xmm8, %xmm12 + pxor %xmm9, %xmm13 + pxor %xmm10, %xmm14 + pxor %xmm11, %xmm15 + movdqa %xmm12, 0(%rsp) + movdqa %xmm13, 16(%rsp) + movdqa %xmm14, 32(%rsp) + movdqa %xmm15, 48(%rsp) + call salsa8_core_gen + movq 128(%rsp), %rsi + paddd %xmm0, %xmm12 + paddd %xmm1, %xmm13 + paddd %xmm2, %xmm14 + paddd %xmm3, %xmm15 + + addq $128, %rsi + movq 120(%rsp), %rcx + cmpq %rcx, %rsi + jne scrypt_core_gen_loop1 + + movq 96(%rsp), %r8 + movq %r8, %rcx + subl $1, %r8d + movq %r8, 96(%rsp) + movd %xmm12, %edx +scrypt_core_gen_loop2: + movq 112(%rsp), %rsi + andl %r8d, %edx + shll $7, %edx + addq %rsi, %rdx + movdqa 0(%rdx), %xmm0 + movdqa 16(%rdx), %xmm1 + movdqa 32(%rdx), %xmm2 + movdqa 48(%rdx), %xmm3 + movdqa 64(%rdx), %xmm4 + movdqa 80(%rdx), %xmm5 + movdqa 96(%rdx), %xmm6 + movdqa 112(%rdx), %xmm7 + pxor %xmm0, %xmm8 + pxor %xmm1, %xmm9 + pxor %xmm2, %xmm10 + pxor %xmm3, %xmm11 + pxor %xmm4, %xmm12 + pxor %xmm5, %xmm13 + pxor %xmm6, %xmm14 + pxor %xmm7, %xmm15 + + pxor %xmm12, %xmm8 + pxor %xmm13, %xmm9 + pxor %xmm14, %xmm10 + pxor %xmm15, %xmm11 + movdqa %xmm8, 0(%rsp) + movdqa %xmm9, 16(%rsp) + movdqa %xmm10, 32(%rsp) + movdqa %xmm11, 48(%rsp) + movq %rcx, 128(%rsp) + call salsa8_core_gen + paddd %xmm0, %xmm8 + paddd %xmm1, %xmm9 + paddd %xmm2, %xmm10 + paddd %xmm3, %xmm11 + + pxor %xmm8, %xmm12 + pxor %xmm9, %xmm13 + pxor %xmm10, %xmm14 + pxor %xmm11, %xmm15 + movdqa %xmm12, 0(%rsp) + movdqa %xmm13, 16(%rsp) + movdqa %xmm14, 32(%rsp) + movdqa %xmm15, 48(%rsp) + call salsa8_core_gen + movq 96(%rsp), %r8 + movq 128(%rsp), %rcx + addl 0(%rsp), %edx + paddd %xmm0, %xmm12 + paddd %xmm1, %xmm13 + paddd %xmm2, %xmm14 + paddd %xmm3, %xmm15 + + subq $1, %rcx + ja 
scrypt_core_gen_loop2 + + movq 104(%rsp), %rdi + movdqa %xmm8, 0(%rdi) + movdqa %xmm9, 16(%rdi) + movdqa %xmm10, 32(%rdi) + movdqa %xmm11, 48(%rdi) + movdqa %xmm12, 64(%rdi) + movdqa %xmm13, 80(%rdi) + movdqa %xmm14, 96(%rdi) + movdqa %xmm15, 112(%rdi) + + addq $136, %rsp + scrypt_core_cleanup + ret + + +.macro salsa8_core_xmm_doubleround + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 +.endm + +.macro salsa8_core_xmm + salsa8_core_xmm_doubleround + salsa8_core_xmm_doubleround + salsa8_core_xmm_doubleround + salsa8_core_xmm_doubleround +.endm + + .p2align 6 +scrypt_core_xmm: + pcmpeqw %xmm1, %xmm1 + psrlq $32, %xmm1 + + movdqa 0(%rdi), %xmm8 + movdqa 16(%rdi), %xmm11 + movdqa 32(%rdi), %xmm10 + movdqa 48(%rdi), %xmm9 + movdqa %xmm8, %xmm0 + pxor %xmm11, %xmm8 + pand %xmm1, %xmm8 + pxor %xmm11, %xmm8 + pxor %xmm10, %xmm11 + pand %xmm1, %xmm11 + pxor %xmm10, %xmm11 + pxor %xmm9, %xmm10 + pand %xmm1, %xmm10 + pxor %xmm9, %xmm10 + pxor %xmm0, %xmm9 + pand %xmm1, %xmm9 + pxor %xmm0, %xmm9 + movdqa %xmm8, %xmm0 + pshufd $0x4e, %xmm10, %xmm10 + punpcklqdq %xmm10, %xmm8 + punpckhqdq %xmm0, %xmm10 + movdqa %xmm11, %xmm0 + pshufd $0x4e, %xmm9, %xmm9 + punpcklqdq %xmm9, %xmm11 + punpckhqdq %xmm0, %xmm9 + + movdqa 64(%rdi), %xmm12 + movdqa 80(%rdi), %xmm15 + movdqa 96(%rdi), %xmm14 + movdqa 112(%rdi), %xmm13 + movdqa %xmm12, %xmm0 + pxor %xmm15, %xmm12 + pand %xmm1, %xmm12 + pxor %xmm15, %xmm12 + pxor %xmm14, %xmm15 + pand %xmm1, %xmm15 + pxor %xmm14, %xmm15 + pxor %xmm13, %xmm14 + pand %xmm1, %xmm14 + pxor %xmm13, %xmm14 + pxor %xmm0, %xmm13 + pand %xmm1, %xmm13 + pxor %xmm0, %xmm13 + movdqa %xmm12, %xmm0 + pshufd $0x4e, %xmm14, %xmm14 + punpcklqdq %xmm14, %xmm12 + punpckhqdq %xmm0, %xmm14 + movdqa %xmm15, %xmm0 + pshufd $0x4e, %xmm13, %xmm13 + punpcklqdq %xmm13, %xmm15 + punpckhqdq %xmm0, %xmm13 + + movq %rsi, %rdx + movq %r8, %rcx + shlq $7, %rcx + addq %rsi, %rcx +scrypt_core_xmm_loop1: + pxor %xmm12, %xmm8 + pxor %xmm13, %xmm9 + pxor %xmm14, %xmm10 + pxor %xmm15, %xmm11 + movdqa %xmm8, 0(%rdx) + movdqa %xmm9, 16(%rdx) + movdqa %xmm10, 32(%rdx) + movdqa %xmm11, 48(%rdx) + movdqa %xmm12, 64(%rdx) + movdqa %xmm13, 80(%rdx) + movdqa %xmm14, 96(%rdx) + movdqa %xmm15, 112(%rdx) + + movdqa %xmm8, %xmm0 + movdqa %xmm9, %xmm1 + movdqa %xmm10, %xmm2 + 
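
salsa8_core_xmm, like salsa8_core_gen and the AVX/AVX2 variants, computes the Salsa20/8 core: four double-rounds (a column round followed by a row round, repeated four times) plus the feed-forward addition. A portable reference in C, operating in place on the 16-word state, is sketched below for comparison:

#include <stdint.h>

#define R32(v, c) (((v) << (c)) | ((v) >> (32 - (c))))

/* Salsa20/8 core: 8 rounds = 4 double-rounds over the 16-word state,
 * followed by the feed-forward addition of the original input. */
static void salsa20_8_core(uint32_t B[16])
{
    uint32_t x[16];
    int i;

    for (i = 0; i < 16; i++)
        x[i] = B[i];
    for (i = 0; i < 8; i += 2) {
        /* column round */
        x[ 4] ^= R32(x[ 0] + x[12],  7);  x[ 8] ^= R32(x[ 4] + x[ 0],  9);
        x[12] ^= R32(x[ 8] + x[ 4], 13);  x[ 0] ^= R32(x[12] + x[ 8], 18);
        x[ 9] ^= R32(x[ 5] + x[ 1],  7);  x[13] ^= R32(x[ 9] + x[ 5],  9);
        x[ 1] ^= R32(x[13] + x[ 9], 13);  x[ 5] ^= R32(x[ 1] + x[13], 18);
        x[14] ^= R32(x[10] + x[ 6],  7);  x[ 2] ^= R32(x[14] + x[10],  9);
        x[ 6] ^= R32(x[ 2] + x[14], 13);  x[10] ^= R32(x[ 6] + x[ 2], 18);
        x[ 3] ^= R32(x[15] + x[11],  7);  x[ 7] ^= R32(x[ 3] + x[15],  9);
        x[11] ^= R32(x[ 7] + x[ 3], 13);  x[15] ^= R32(x[11] + x[ 7], 18);
        /* row round */
        x[ 1] ^= R32(x[ 0] + x[ 3],  7);  x[ 2] ^= R32(x[ 1] + x[ 0],  9);
        x[ 3] ^= R32(x[ 2] + x[ 1], 13);  x[ 0] ^= R32(x[ 3] + x[ 2], 18);
        x[ 6] ^= R32(x[ 5] + x[ 4],  7);  x[ 7] ^= R32(x[ 6] + x[ 5],  9);
        x[ 4] ^= R32(x[ 7] + x[ 6], 13);  x[ 5] ^= R32(x[ 4] + x[ 7], 18);
        x[11] ^= R32(x[10] + x[ 9],  7);  x[ 8] ^= R32(x[11] + x[10],  9);
        x[ 9] ^= R32(x[ 8] + x[11], 13);  x[10] ^= R32(x[ 9] + x[ 8], 18);
        x[12] ^= R32(x[15] + x[14],  7);  x[13] ^= R32(x[12] + x[15],  9);
        x[14] ^= R32(x[13] + x[12], 13);  x[15] ^= R32(x[14] + x[13], 18);
    }
    for (i = 0; i < 16; i++)
        B[i] += x[i];
}

The SIMD kernels operate on a diagonally permuted word order (set up by the scrypt_shuffle macro), which is why pshufd/vpshufd rotations appear between the quarter-round groups, but the function computed is the same.
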
movdqa %xmm11, %xmm3 + salsa8_core_xmm + paddd %xmm0, %xmm8 + paddd %xmm1, %xmm9 + paddd %xmm2, %xmm10 + paddd %xmm3, %xmm11 + + pxor %xmm8, %xmm12 + pxor %xmm9, %xmm13 + pxor %xmm10, %xmm14 + pxor %xmm11, %xmm15 + movdqa %xmm12, %xmm0 + movdqa %xmm13, %xmm1 + movdqa %xmm14, %xmm2 + movdqa %xmm15, %xmm3 + salsa8_core_xmm + paddd %xmm0, %xmm12 + paddd %xmm1, %xmm13 + paddd %xmm2, %xmm14 + paddd %xmm3, %xmm15 + + addq $128, %rdx + cmpq %rcx, %rdx + jne scrypt_core_xmm_loop1 + + movq %r8, %rcx + subl $1, %r8d +scrypt_core_xmm_loop2: + movd %xmm12, %edx + andl %r8d, %edx + shll $7, %edx + pxor 0(%rsi, %rdx), %xmm8 + pxor 16(%rsi, %rdx), %xmm9 + pxor 32(%rsi, %rdx), %xmm10 + pxor 48(%rsi, %rdx), %xmm11 + + pxor %xmm12, %xmm8 + pxor %xmm13, %xmm9 + pxor %xmm14, %xmm10 + pxor %xmm15, %xmm11 + movdqa %xmm8, %xmm0 + movdqa %xmm9, %xmm1 + movdqa %xmm10, %xmm2 + movdqa %xmm11, %xmm3 + salsa8_core_xmm + paddd %xmm0, %xmm8 + paddd %xmm1, %xmm9 + paddd %xmm2, %xmm10 + paddd %xmm3, %xmm11 + + pxor 64(%rsi, %rdx), %xmm12 + pxor 80(%rsi, %rdx), %xmm13 + pxor 96(%rsi, %rdx), %xmm14 + pxor 112(%rsi, %rdx), %xmm15 + pxor %xmm8, %xmm12 + pxor %xmm9, %xmm13 + pxor %xmm10, %xmm14 + pxor %xmm11, %xmm15 + movdqa %xmm12, %xmm0 + movdqa %xmm13, %xmm1 + movdqa %xmm14, %xmm2 + movdqa %xmm15, %xmm3 + salsa8_core_xmm + paddd %xmm0, %xmm12 + paddd %xmm1, %xmm13 + paddd %xmm2, %xmm14 + paddd %xmm3, %xmm15 + + subq $1, %rcx + ja scrypt_core_xmm_loop2 + + pcmpeqw %xmm1, %xmm1 + psrlq $32, %xmm1 + + movdqa %xmm8, %xmm0 + pxor %xmm9, %xmm8 + pand %xmm1, %xmm8 + pxor %xmm9, %xmm8 + pxor %xmm10, %xmm9 + pand %xmm1, %xmm9 + pxor %xmm10, %xmm9 + pxor %xmm11, %xmm10 + pand %xmm1, %xmm10 + pxor %xmm11, %xmm10 + pxor %xmm0, %xmm11 + pand %xmm1, %xmm11 + pxor %xmm0, %xmm11 + movdqa %xmm8, %xmm0 + pshufd $0x4e, %xmm10, %xmm10 + punpcklqdq %xmm10, %xmm8 + punpckhqdq %xmm0, %xmm10 + movdqa %xmm9, %xmm0 + pshufd $0x4e, %xmm11, %xmm11 + punpcklqdq %xmm11, %xmm9 + punpckhqdq %xmm0, %xmm11 + movdqa %xmm8, 0(%rdi) + movdqa %xmm11, 16(%rdi) + movdqa %xmm10, 32(%rdi) + movdqa %xmm9, 48(%rdi) + + movdqa %xmm12, %xmm0 + pxor %xmm13, %xmm12 + pand %xmm1, %xmm12 + pxor %xmm13, %xmm12 + pxor %xmm14, %xmm13 + pand %xmm1, %xmm13 + pxor %xmm14, %xmm13 + pxor %xmm15, %xmm14 + pand %xmm1, %xmm14 + pxor %xmm15, %xmm14 + pxor %xmm0, %xmm15 + pand %xmm1, %xmm15 + pxor %xmm0, %xmm15 + movdqa %xmm12, %xmm0 + pshufd $0x4e, %xmm14, %xmm14 + punpcklqdq %xmm14, %xmm12 + punpckhqdq %xmm0, %xmm14 + movdqa %xmm13, %xmm0 + pshufd $0x4e, %xmm15, %xmm15 + punpcklqdq %xmm15, %xmm13 + punpckhqdq %xmm0, %xmm15 + movdqa %xmm12, 64(%rdi) + movdqa %xmm15, 80(%rdi) + movdqa %xmm14, 96(%rdi) + movdqa %xmm13, 112(%rdi) + + scrypt_core_cleanup + ret + + +#if defined(USE_AVX) +.macro salsa8_core_3way_avx_doubleround + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, 
%xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpshufd $0x39, %xmm1, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpshufd $0x39, %xmm9, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpshufd $0x4e, %xmm14, %xmm14 + vpshufd $0x39, %xmm11, %xmm11 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm15, %xmm15 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 +.endm + +.macro salsa8_core_3way_avx + salsa8_core_3way_avx_doubleround + salsa8_core_3way_avx_doubleround + salsa8_core_3way_avx_doubleround + salsa8_core_3way_avx_doubleround +.endm +#endif /* USE_AVX */ + + .text + .p2align 6 + .globl scrypt_core_3way + .globl _scrypt_core_3way +scrypt_core_3way: +_scrypt_core_3way: + pushq %rbx + pushq %rbp +#if 
defined(_WIN64) || defined(__CYGWIN__) + subq $176, %rsp + movdqa %xmm6, 8(%rsp) + movdqa %xmm7, 24(%rsp) + movdqa %xmm8, 40(%rsp) + movdqa %xmm9, 56(%rsp) + movdqa %xmm10, 72(%rsp) + movdqa %xmm11, 88(%rsp) + movdqa %xmm12, 104(%rsp) + movdqa %xmm13, 120(%rsp) + movdqa %xmm14, 136(%rsp) + movdqa %xmm15, 152(%rsp) + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#else + movq %rdx, %r8 +#endif + subq $392, %rsp + +.macro scrypt_core_3way_cleanup + addq $392, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + popq %rdi + movdqa 8(%rsp), %xmm6 + movdqa 24(%rsp), %xmm7 + movdqa 40(%rsp), %xmm8 + movdqa 56(%rsp), %xmm9 + movdqa 72(%rsp), %xmm10 + movdqa 88(%rsp), %xmm11 + movdqa 104(%rsp), %xmm12 + movdqa 120(%rsp), %xmm13 + movdqa 136(%rsp), %xmm14 + movdqa 152(%rsp), %xmm15 + addq $176, %rsp +#endif + popq %rbp + popq %rbx +.endm + +#if !defined(USE_AVX) + jmp scrypt_core_3way_xmm +#else + /* Check for AVX and OSXSAVE support */ + movl $1, %eax + cpuid + andl $0x18000000, %ecx + cmpl $0x18000000, %ecx + jne scrypt_core_3way_xmm + /* Check for XMM and YMM state support */ + xorl %ecx, %ecx + xgetbv + andl $0x00000006, %eax + cmpl $0x00000006, %eax + jne scrypt_core_3way_xmm +#if defined(USE_XOP) + /* Check for XOP support */ + movl $0x80000001, %eax + cpuid + andl $0x00000800, %ecx + jnz scrypt_core_3way_xop +#endif + +scrypt_core_3way_avx: + scrypt_shuffle %rdi, 0, %rsp, 0 + scrypt_shuffle %rdi, 64, %rsp, 64 + scrypt_shuffle %rdi, 128, %rsp, 128 + scrypt_shuffle %rdi, 192, %rsp, 192 + scrypt_shuffle %rdi, 256, %rsp, 256 + scrypt_shuffle %rdi, 320, %rsp, 320 + + movdqa 64(%rsp), %xmm0 + movdqa 80(%rsp), %xmm1 + movdqa 96(%rsp), %xmm2 + movdqa 112(%rsp), %xmm3 + movdqa 128+64(%rsp), %xmm8 + movdqa 128+80(%rsp), %xmm9 + movdqa 128+96(%rsp), %xmm10 + movdqa 128+112(%rsp), %xmm11 + movdqa 256+64(%rsp), %xmm12 + movdqa 256+80(%rsp), %xmm13 + movdqa 256+96(%rsp), %xmm14 + movdqa 256+112(%rsp), %xmm15 + + movq %rsi, %rbx + leaq (%r8, %r8, 2), %rax + shlq $7, %rax + addq %rsi, %rax +scrypt_core_3way_avx_loop1: + movdqa %xmm0, 64(%rbx) + movdqa %xmm1, 80(%rbx) + movdqa %xmm2, 96(%rbx) + movdqa %xmm3, 112(%rbx) + pxor 0(%rsp), %xmm0 + pxor 16(%rsp), %xmm1 + pxor 32(%rsp), %xmm2 + pxor 48(%rsp), %xmm3 + movdqa %xmm8, 128+64(%rbx) + movdqa %xmm9, 128+80(%rbx) + movdqa %xmm10, 128+96(%rbx) + movdqa %xmm11, 128+112(%rbx) + pxor 128+0(%rsp), %xmm8 + pxor 128+16(%rsp), %xmm9 + pxor 128+32(%rsp), %xmm10 + pxor 128+48(%rsp), %xmm11 + movdqa %xmm12, 256+64(%rbx) + movdqa %xmm13, 256+80(%rbx) + movdqa %xmm14, 256+96(%rbx) + movdqa %xmm15, 256+112(%rbx) + pxor 256+0(%rsp), %xmm12 + pxor 256+16(%rsp), %xmm13 + pxor 256+32(%rsp), %xmm14 + pxor 256+48(%rsp), %xmm15 + movdqa %xmm0, 0(%rbx) + movdqa %xmm1, 16(%rbx) + movdqa %xmm2, 32(%rbx) + movdqa %xmm3, 48(%rbx) + movdqa %xmm8, 128+0(%rbx) + movdqa %xmm9, 128+16(%rbx) + movdqa %xmm10, 128+32(%rbx) + movdqa %xmm11, 128+48(%rbx) + movdqa %xmm12, 256+0(%rbx) + movdqa %xmm13, 256+16(%rbx) + movdqa %xmm14, 256+32(%rbx) + movdqa %xmm15, 256+48(%rbx) + + salsa8_core_3way_avx + paddd 0(%rbx), %xmm0 + paddd 16(%rbx), %xmm1 + paddd 32(%rbx), %xmm2 + paddd 48(%rbx), %xmm3 + paddd 128+0(%rbx), %xmm8 + paddd 128+16(%rbx), %xmm9 + paddd 128+32(%rbx), %xmm10 + paddd 128+48(%rbx), %xmm11 + paddd 256+0(%rbx), %xmm12 + paddd 256+16(%rbx), %xmm13 + paddd 256+32(%rbx), %xmm14 + paddd 256+48(%rbx), %xmm15 + movdqa %xmm0, 0(%rsp) + movdqa %xmm1, 16(%rsp) + movdqa %xmm2, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm8, 128+0(%rsp) + movdqa %xmm9, 128+16(%rsp) + 
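
scrypt_core_3way and its AVX/XOP variants do not change the algorithm; they run three independent scrypt lanes in one call so the register file and execution ports stay busy while each lane waits on memory. Semantically this is equivalent to the sketch below, where scrypt_core_1way() is a hypothetical single-lane core; the real code additionally interleaves the three lanes' blocks in registers and in the scratchpad (the 0/128/256 offsets above).

#include <stdint.h>

/* Hypothetical single-lane core: one 128-byte block X, scratchpad V of
 * N * 32 words, as in the single-lane scrypt_core entry point. */
void scrypt_core_1way(uint32_t X[32], uint32_t *V, uint32_t N);

/* Lane-by-lane view of what the interleaved 3-way kernel computes.
 * X holds three 128-byte blocks back to back; V has room for 3 lanes. */
static void scrypt_core_3way_sketch(uint32_t X[3 * 32], uint32_t *V,
                                    uint32_t N)
{
    int lane;
    for (lane = 0; lane < 3; lane++)
        scrypt_core_1way(X + lane * 32, V + (uint64_t)lane * 32 * N, N);
}
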
movdqa %xmm10, 128+32(%rsp) + movdqa %xmm11, 128+48(%rsp) + movdqa %xmm12, 256+0(%rsp) + movdqa %xmm13, 256+16(%rsp) + movdqa %xmm14, 256+32(%rsp) + movdqa %xmm15, 256+48(%rsp) + + pxor 64(%rbx), %xmm0 + pxor 80(%rbx), %xmm1 + pxor 96(%rbx), %xmm2 + pxor 112(%rbx), %xmm3 + pxor 128+64(%rbx), %xmm8 + pxor 128+80(%rbx), %xmm9 + pxor 128+96(%rbx), %xmm10 + pxor 128+112(%rbx), %xmm11 + pxor 256+64(%rbx), %xmm12 + pxor 256+80(%rbx), %xmm13 + pxor 256+96(%rbx), %xmm14 + pxor 256+112(%rbx), %xmm15 + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + salsa8_core_3way_avx + paddd 64(%rsp), %xmm0 + paddd 80(%rsp), %xmm1 + paddd 96(%rsp), %xmm2 + paddd 112(%rsp), %xmm3 + paddd 128+64(%rsp), %xmm8 + paddd 128+80(%rsp), %xmm9 + paddd 128+96(%rsp), %xmm10 + paddd 128+112(%rsp), %xmm11 + paddd 256+64(%rsp), %xmm12 + paddd 256+80(%rsp), %xmm13 + paddd 256+96(%rsp), %xmm14 + paddd 256+112(%rsp), %xmm15 + + addq $3*128, %rbx + cmpq %rax, %rbx + jne scrypt_core_3way_avx_loop1 + + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + + movq %r8, %rcx + subq $1, %r8 +scrypt_core_3way_avx_loop2: + movd %xmm0, %ebp + movd %xmm8, %ebx + movd %xmm12, %eax + pxor 0(%rsp), %xmm0 + pxor 16(%rsp), %xmm1 + pxor 32(%rsp), %xmm2 + pxor 48(%rsp), %xmm3 + pxor 128+0(%rsp), %xmm8 + pxor 128+16(%rsp), %xmm9 + pxor 128+32(%rsp), %xmm10 + pxor 128+48(%rsp), %xmm11 + pxor 256+0(%rsp), %xmm12 + pxor 256+16(%rsp), %xmm13 + pxor 256+32(%rsp), %xmm14 + pxor 256+48(%rsp), %xmm15 + andl %r8d, %ebp + leaq (%rbp, %rbp, 2), %rbp + shll $7, %ebp + andl %r8d, %ebx + leaq 1(%rbx, %rbx, 2), %rbx + shll $7, %ebx + andl %r8d, %eax + leaq 2(%rax, %rax, 2), %rax + shll $7, %eax + pxor 0(%rsi, %rbp), %xmm0 + pxor 16(%rsi, %rbp), %xmm1 + pxor 32(%rsi, %rbp), %xmm2 + pxor 48(%rsi, %rbp), %xmm3 + pxor 0(%rsi, %rbx), %xmm8 + pxor 16(%rsi, %rbx), %xmm9 + pxor 32(%rsi, %rbx), %xmm10 + pxor 48(%rsi, %rbx), %xmm11 + pxor 0(%rsi, %rax), %xmm12 + pxor 16(%rsi, %rax), %xmm13 + pxor 32(%rsi, %rax), %xmm14 + pxor 48(%rsi, %rax), %xmm15 + + movdqa %xmm0, 0(%rsp) + movdqa %xmm1, 16(%rsp) + movdqa %xmm2, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm8, 128+0(%rsp) + movdqa %xmm9, 128+16(%rsp) + movdqa %xmm10, 128+32(%rsp) + movdqa %xmm11, 128+48(%rsp) + movdqa %xmm12, 256+0(%rsp) + movdqa %xmm13, 256+16(%rsp) + movdqa %xmm14, 256+32(%rsp) + movdqa %xmm15, 256+48(%rsp) + salsa8_core_3way_avx + paddd 0(%rsp), %xmm0 + paddd 16(%rsp), %xmm1 + paddd 32(%rsp), %xmm2 + paddd 48(%rsp), %xmm3 + paddd 128+0(%rsp), %xmm8 + paddd 128+16(%rsp), %xmm9 + paddd 128+32(%rsp), %xmm10 + paddd 128+48(%rsp), %xmm11 + paddd 256+0(%rsp), %xmm12 + paddd 256+16(%rsp), %xmm13 + paddd 256+32(%rsp), %xmm14 + paddd 256+48(%rsp), %xmm15 + movdqa %xmm0, 0(%rsp) + movdqa %xmm1, 16(%rsp) + movdqa %xmm2, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm8, 128+0(%rsp) + movdqa %xmm9, 128+16(%rsp) + movdqa %xmm10, 128+32(%rsp) + movdqa %xmm11, 128+48(%rsp) + movdqa %xmm12, 256+0(%rsp) + movdqa %xmm13, 256+16(%rsp) + movdqa %xmm14, 
256+32(%rsp) + movdqa %xmm15, 256+48(%rsp) + + pxor 64(%rsi, %rbp), %xmm0 + pxor 80(%rsi, %rbp), %xmm1 + pxor 96(%rsi, %rbp), %xmm2 + pxor 112(%rsi, %rbp), %xmm3 + pxor 64(%rsi, %rbx), %xmm8 + pxor 80(%rsi, %rbx), %xmm9 + pxor 96(%rsi, %rbx), %xmm10 + pxor 112(%rsi, %rbx), %xmm11 + pxor 64(%rsi, %rax), %xmm12 + pxor 80(%rsi, %rax), %xmm13 + pxor 96(%rsi, %rax), %xmm14 + pxor 112(%rsi, %rax), %xmm15 + pxor 64(%rsp), %xmm0 + pxor 80(%rsp), %xmm1 + pxor 96(%rsp), %xmm2 + pxor 112(%rsp), %xmm3 + pxor 128+64(%rsp), %xmm8 + pxor 128+80(%rsp), %xmm9 + pxor 128+96(%rsp), %xmm10 + pxor 128+112(%rsp), %xmm11 + pxor 256+64(%rsp), %xmm12 + pxor 256+80(%rsp), %xmm13 + pxor 256+96(%rsp), %xmm14 + pxor 256+112(%rsp), %xmm15 + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + salsa8_core_3way_avx + paddd 64(%rsp), %xmm0 + paddd 80(%rsp), %xmm1 + paddd 96(%rsp), %xmm2 + paddd 112(%rsp), %xmm3 + paddd 128+64(%rsp), %xmm8 + paddd 128+80(%rsp), %xmm9 + paddd 128+96(%rsp), %xmm10 + paddd 128+112(%rsp), %xmm11 + paddd 256+64(%rsp), %xmm12 + paddd 256+80(%rsp), %xmm13 + paddd 256+96(%rsp), %xmm14 + paddd 256+112(%rsp), %xmm15 + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + + subq $1, %rcx + ja scrypt_core_3way_avx_loop2 + + scrypt_shuffle %rsp, 0, %rdi, 0 + scrypt_shuffle %rsp, 64, %rdi, 64 + scrypt_shuffle %rsp, 128, %rdi, 128 + scrypt_shuffle %rsp, 192, %rdi, 192 + scrypt_shuffle %rsp, 256, %rdi, 256 + scrypt_shuffle %rsp, 320, %rdi, 320 + + scrypt_core_3way_cleanup + ret + +#if defined(USE_XOP) +.macro salsa8_core_3way_xop_doubleround + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpshufd $0x39, %xmm1, %xmm1 + 
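+	/* XOP variant of the Salsa20/8 double round: vprotd rotates each 32-bit
+	 * lane by 7, 9, 13 or 18 in a single instruction, replacing the
+	 * vpslld/vpsrld plus double-vpxor sequences of the plain AVX rounds. */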
vpshufd $0x39, %xmm9, %xmm9 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm7, %xmm12, %xmm12 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm11, %xmm11 + vpshufd $0x39, %xmm15, %xmm15 +.endm + +.macro salsa8_core_3way_xop + salsa8_core_3way_xop_doubleround + salsa8_core_3way_xop_doubleround + salsa8_core_3way_xop_doubleround + salsa8_core_3way_xop_doubleround +.endm + + .p2align 6 +scrypt_core_3way_xop: + scrypt_shuffle %rdi, 0, %rsp, 0 + scrypt_shuffle %rdi, 64, %rsp, 64 + scrypt_shuffle %rdi, 128, %rsp, 128 + scrypt_shuffle %rdi, 192, %rsp, 192 + scrypt_shuffle %rdi, 256, %rsp, 256 + scrypt_shuffle %rdi, 320, %rsp, 320 + + movdqa 64(%rsp), %xmm0 + movdqa 80(%rsp), %xmm1 + movdqa 96(%rsp), %xmm2 + movdqa 112(%rsp), %xmm3 + movdqa 128+64(%rsp), %xmm8 + movdqa 128+80(%rsp), %xmm9 + movdqa 128+96(%rsp), %xmm10 + movdqa 128+112(%rsp), %xmm11 + movdqa 256+64(%rsp), %xmm12 + movdqa 256+80(%rsp), %xmm13 + movdqa 256+96(%rsp), %xmm14 + movdqa 256+112(%rsp), %xmm15 + + movq %rsi, %rbx + leaq (%r8, %r8, 2), %rax + shlq $7, %rax + addq %rsi, %rax +scrypt_core_3way_xop_loop1: + movdqa %xmm0, 64(%rbx) + movdqa %xmm1, 80(%rbx) + movdqa %xmm2, 96(%rbx) + movdqa %xmm3, 112(%rbx) + pxor 0(%rsp), %xmm0 + pxor 16(%rsp), %xmm1 + pxor 32(%rsp), %xmm2 + pxor 48(%rsp), %xmm3 + movdqa %xmm8, 128+64(%rbx) + movdqa %xmm9, 128+80(%rbx) + movdqa %xmm10, 128+96(%rbx) + movdqa %xmm11, 128+112(%rbx) + pxor 128+0(%rsp), %xmm8 + pxor 128+16(%rsp), %xmm9 + pxor 128+32(%rsp), %xmm10 + pxor 128+48(%rsp), %xmm11 + movdqa %xmm12, 256+64(%rbx) + movdqa %xmm13, 256+80(%rbx) + movdqa %xmm14, 256+96(%rbx) + movdqa %xmm15, 256+112(%rbx) + pxor 256+0(%rsp), %xmm12 + pxor 256+16(%rsp), %xmm13 + pxor 256+32(%rsp), %xmm14 + pxor 256+48(%rsp), %xmm15 + movdqa %xmm0, 0(%rbx) + movdqa %xmm1, 16(%rbx) + movdqa %xmm2, 32(%rbx) + movdqa %xmm3, 48(%rbx) + movdqa %xmm8, 128+0(%rbx) + movdqa %xmm9, 128+16(%rbx) + movdqa %xmm10, 128+32(%rbx) + movdqa %xmm11, 128+48(%rbx) + movdqa %xmm12, 256+0(%rbx) + movdqa %xmm13, 256+16(%rbx) + movdqa %xmm14, 256+32(%rbx) + movdqa %xmm15, 256+48(%rbx) + + salsa8_core_3way_xop + paddd 0(%rbx), %xmm0 + paddd 16(%rbx), %xmm1 + paddd 32(%rbx), %xmm2 + paddd 48(%rbx), %xmm3 + paddd 128+0(%rbx), %xmm8 + paddd 128+16(%rbx), %xmm9 + paddd 128+32(%rbx), %xmm10 + paddd 128+48(%rbx), %xmm11 + paddd 256+0(%rbx), %xmm12 + paddd 256+16(%rbx), %xmm13 + paddd 256+32(%rbx), %xmm14 + paddd 256+48(%rbx), %xmm15 + movdqa %xmm0, 0(%rsp) + movdqa %xmm1, 16(%rsp) + movdqa %xmm2, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm8, 
128+0(%rsp) + movdqa %xmm9, 128+16(%rsp) + movdqa %xmm10, 128+32(%rsp) + movdqa %xmm11, 128+48(%rsp) + movdqa %xmm12, 256+0(%rsp) + movdqa %xmm13, 256+16(%rsp) + movdqa %xmm14, 256+32(%rsp) + movdqa %xmm15, 256+48(%rsp) + + pxor 64(%rbx), %xmm0 + pxor 80(%rbx), %xmm1 + pxor 96(%rbx), %xmm2 + pxor 112(%rbx), %xmm3 + pxor 128+64(%rbx), %xmm8 + pxor 128+80(%rbx), %xmm9 + pxor 128+96(%rbx), %xmm10 + pxor 128+112(%rbx), %xmm11 + pxor 256+64(%rbx), %xmm12 + pxor 256+80(%rbx), %xmm13 + pxor 256+96(%rbx), %xmm14 + pxor 256+112(%rbx), %xmm15 + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + salsa8_core_3way_xop + paddd 64(%rsp), %xmm0 + paddd 80(%rsp), %xmm1 + paddd 96(%rsp), %xmm2 + paddd 112(%rsp), %xmm3 + paddd 128+64(%rsp), %xmm8 + paddd 128+80(%rsp), %xmm9 + paddd 128+96(%rsp), %xmm10 + paddd 128+112(%rsp), %xmm11 + paddd 256+64(%rsp), %xmm12 + paddd 256+80(%rsp), %xmm13 + paddd 256+96(%rsp), %xmm14 + paddd 256+112(%rsp), %xmm15 + + addq $3*128, %rbx + cmpq %rax, %rbx + jne scrypt_core_3way_xop_loop1 + + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + + movq %r8, %rcx + subq $1, %r8 +scrypt_core_3way_xop_loop2: + movd %xmm0, %ebp + movd %xmm8, %ebx + movd %xmm12, %eax + pxor 0(%rsp), %xmm0 + pxor 16(%rsp), %xmm1 + pxor 32(%rsp), %xmm2 + pxor 48(%rsp), %xmm3 + pxor 128+0(%rsp), %xmm8 + pxor 128+16(%rsp), %xmm9 + pxor 128+32(%rsp), %xmm10 + pxor 128+48(%rsp), %xmm11 + pxor 256+0(%rsp), %xmm12 + pxor 256+16(%rsp), %xmm13 + pxor 256+32(%rsp), %xmm14 + pxor 256+48(%rsp), %xmm15 + andl %r8d, %ebp + leaq (%rbp, %rbp, 2), %rbp + shll $7, %ebp + andl %r8d, %ebx + leaq 1(%rbx, %rbx, 2), %rbx + shll $7, %ebx + andl %r8d, %eax + leaq 2(%rax, %rax, 2), %rax + shll $7, %eax + pxor 0(%rsi, %rbp), %xmm0 + pxor 16(%rsi, %rbp), %xmm1 + pxor 32(%rsi, %rbp), %xmm2 + pxor 48(%rsi, %rbp), %xmm3 + pxor 0(%rsi, %rbx), %xmm8 + pxor 16(%rsi, %rbx), %xmm9 + pxor 32(%rsi, %rbx), %xmm10 + pxor 48(%rsi, %rbx), %xmm11 + pxor 0(%rsi, %rax), %xmm12 + pxor 16(%rsi, %rax), %xmm13 + pxor 32(%rsi, %rax), %xmm14 + pxor 48(%rsi, %rax), %xmm15 + + movdqa %xmm0, 0(%rsp) + movdqa %xmm1, 16(%rsp) + movdqa %xmm2, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm8, 128+0(%rsp) + movdqa %xmm9, 128+16(%rsp) + movdqa %xmm10, 128+32(%rsp) + movdqa %xmm11, 128+48(%rsp) + movdqa %xmm12, 256+0(%rsp) + movdqa %xmm13, 256+16(%rsp) + movdqa %xmm14, 256+32(%rsp) + movdqa %xmm15, 256+48(%rsp) + salsa8_core_3way_xop + paddd 0(%rsp), %xmm0 + paddd 16(%rsp), %xmm1 + paddd 32(%rsp), %xmm2 + paddd 48(%rsp), %xmm3 + paddd 128+0(%rsp), %xmm8 + paddd 128+16(%rsp), %xmm9 + paddd 128+32(%rsp), %xmm10 + paddd 128+48(%rsp), %xmm11 + paddd 256+0(%rsp), %xmm12 + paddd 256+16(%rsp), %xmm13 + paddd 256+32(%rsp), %xmm14 + paddd 256+48(%rsp), %xmm15 + movdqa %xmm0, 0(%rsp) + movdqa %xmm1, 16(%rsp) + movdqa %xmm2, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm8, 128+0(%rsp) + movdqa %xmm9, 128+16(%rsp) + movdqa %xmm10, 128+32(%rsp) + movdqa %xmm11, 128+48(%rsp) + movdqa %xmm12, 256+0(%rsp) + 
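+	/* Loop 2 of scrypt ROMix, XOP path: the scratchpad index j for each lane
+	 * is scrypt's Integerify step -- the first 32-bit word of that lane's
+	 * second 64-byte half, masked with N-1 in %r8d -- and the lea/shl pair
+	 * turns it into the byte offset (3*j + lane) * 128 within the
+	 * interleaved scratchpad at %rsi. */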
movdqa %xmm13, 256+16(%rsp) + movdqa %xmm14, 256+32(%rsp) + movdqa %xmm15, 256+48(%rsp) + + pxor 64(%rsi, %rbp), %xmm0 + pxor 80(%rsi, %rbp), %xmm1 + pxor 96(%rsi, %rbp), %xmm2 + pxor 112(%rsi, %rbp), %xmm3 + pxor 64(%rsi, %rbx), %xmm8 + pxor 80(%rsi, %rbx), %xmm9 + pxor 96(%rsi, %rbx), %xmm10 + pxor 112(%rsi, %rbx), %xmm11 + pxor 64(%rsi, %rax), %xmm12 + pxor 80(%rsi, %rax), %xmm13 + pxor 96(%rsi, %rax), %xmm14 + pxor 112(%rsi, %rax), %xmm15 + pxor 64(%rsp), %xmm0 + pxor 80(%rsp), %xmm1 + pxor 96(%rsp), %xmm2 + pxor 112(%rsp), %xmm3 + pxor 128+64(%rsp), %xmm8 + pxor 128+80(%rsp), %xmm9 + pxor 128+96(%rsp), %xmm10 + pxor 128+112(%rsp), %xmm11 + pxor 256+64(%rsp), %xmm12 + pxor 256+80(%rsp), %xmm13 + pxor 256+96(%rsp), %xmm14 + pxor 256+112(%rsp), %xmm15 + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + salsa8_core_3way_xop + paddd 64(%rsp), %xmm0 + paddd 80(%rsp), %xmm1 + paddd 96(%rsp), %xmm2 + paddd 112(%rsp), %xmm3 + paddd 128+64(%rsp), %xmm8 + paddd 128+80(%rsp), %xmm9 + paddd 128+96(%rsp), %xmm10 + paddd 128+112(%rsp), %xmm11 + paddd 256+64(%rsp), %xmm12 + paddd 256+80(%rsp), %xmm13 + paddd 256+96(%rsp), %xmm14 + paddd 256+112(%rsp), %xmm15 + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + + subq $1, %rcx + ja scrypt_core_3way_xop_loop2 + + scrypt_shuffle %rsp, 0, %rdi, 0 + scrypt_shuffle %rsp, 64, %rdi, 64 + scrypt_shuffle %rsp, 128, %rdi, 128 + scrypt_shuffle %rsp, 192, %rdi, 192 + scrypt_shuffle %rsp, 256, %rdi, 256 + scrypt_shuffle %rsp, 320, %rdi, 320 + + scrypt_core_3way_cleanup + ret +#endif /* USE_XOP */ +#endif /* USE_AVX */ + +.macro salsa8_core_3way_xmm_doubleround + movdqa %xmm1, %xmm4 + movdqa %xmm9, %xmm6 + movdqa %xmm13, %xmm7 + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + pxor %xmm5, %xmm3 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm11 + pxor %xmm5, %xmm11 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm15 + pxor %xmm5, %xmm15 + movdqa %xmm12, %xmm7 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pshufd $0x93, %xmm3, %xmm3 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm11, %xmm6 + pshufd $0x93, %xmm11, %xmm11 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm14 + pshufd $0x93, %xmm15, %xmm15 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm1 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm9 + movdqa %xmm10, %xmm6 + pshufd 
$0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm9 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm13 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm13 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + pxor %xmm5, %xmm0 + movdqa %xmm3, %xmm4 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm9, %xmm9 + pxor %xmm5, %xmm8 + movdqa %xmm11, %xmm6 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm12 + pshufd $0x39, %xmm13, %xmm13 + + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + pxor %xmm5, %xmm1 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm9 + pxor %xmm5, %xmm9 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm13 + pxor %xmm5, %xmm13 + movdqa %xmm12, %xmm7 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pshufd $0x93, %xmm1, %xmm1 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm9, %xmm6 + pshufd $0x93, %xmm9, %xmm9 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm13, %xmm7 + pshufd $0x93, %xmm13, %xmm13 + pxor %xmm5, %xmm14 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm3 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm11 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm11 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm15 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm15 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm11, %xmm11 + pxor %xmm5, %xmm8 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + pshufd $0x39, %xmm15, %xmm15 + pxor %xmm5, %xmm12 +.endm + +.macro salsa8_core_3way_xmm + salsa8_core_3way_xmm_doubleround + salsa8_core_3way_xmm_doubleround + salsa8_core_3way_xmm_doubleround + salsa8_core_3way_xmm_doubleround +.endm + + .p2align 6 +scrypt_core_3way_xmm: + scrypt_shuffle %rdi, 0, %rsp, 0 + scrypt_shuffle %rdi, 64, %rsp, 64 + scrypt_shuffle %rdi, 128, %rsp, 128 + scrypt_shuffle %rdi, 192, %rsp, 192 + scrypt_shuffle %rdi, 256, %rsp, 256 + scrypt_shuffle %rdi, 320, %rsp, 320 + + movdqa 64(%rsp), %xmm0 + movdqa 80(%rsp), %xmm1 + movdqa 96(%rsp), %xmm2 + movdqa 112(%rsp), %xmm3 + movdqa 128+64(%rsp), %xmm8 + movdqa 128+80(%rsp), %xmm9 + movdqa 128+96(%rsp), %xmm10 + movdqa 128+112(%rsp), %xmm11 + movdqa 256+64(%rsp), %xmm12 + movdqa 256+80(%rsp), %xmm13 + movdqa 256+96(%rsp), %xmm14 + movdqa 256+112(%rsp), %xmm15 + + movq %rsi, %rbx + leaq (%r8, %r8, 2), %rax + shlq $7, %rax + addq %rsi, %rax 
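+	/* Generic SSE2 path, used when AVX/XOP is not compiled in or not
+	 * available at run time: same 3-way structure as above, with each rotate
+	 * spelled out as movdqa + pslld/psrld + two pxor.  %rax = %rsi + N*3*128
+	 * marks the end of the scratchpad fill. */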
+scrypt_core_3way_xmm_loop1: + movdqa %xmm0, 64(%rbx) + movdqa %xmm1, 80(%rbx) + movdqa %xmm2, 96(%rbx) + movdqa %xmm3, 112(%rbx) + pxor 0(%rsp), %xmm0 + pxor 16(%rsp), %xmm1 + pxor 32(%rsp), %xmm2 + pxor 48(%rsp), %xmm3 + movdqa %xmm8, 128+64(%rbx) + movdqa %xmm9, 128+80(%rbx) + movdqa %xmm10, 128+96(%rbx) + movdqa %xmm11, 128+112(%rbx) + pxor 128+0(%rsp), %xmm8 + pxor 128+16(%rsp), %xmm9 + pxor 128+32(%rsp), %xmm10 + pxor 128+48(%rsp), %xmm11 + movdqa %xmm12, 256+64(%rbx) + movdqa %xmm13, 256+80(%rbx) + movdqa %xmm14, 256+96(%rbx) + movdqa %xmm15, 256+112(%rbx) + pxor 256+0(%rsp), %xmm12 + pxor 256+16(%rsp), %xmm13 + pxor 256+32(%rsp), %xmm14 + pxor 256+48(%rsp), %xmm15 + movdqa %xmm0, 0(%rbx) + movdqa %xmm1, 16(%rbx) + movdqa %xmm2, 32(%rbx) + movdqa %xmm3, 48(%rbx) + movdqa %xmm8, 128+0(%rbx) + movdqa %xmm9, 128+16(%rbx) + movdqa %xmm10, 128+32(%rbx) + movdqa %xmm11, 128+48(%rbx) + movdqa %xmm12, 256+0(%rbx) + movdqa %xmm13, 256+16(%rbx) + movdqa %xmm14, 256+32(%rbx) + movdqa %xmm15, 256+48(%rbx) + + salsa8_core_3way_xmm + paddd 0(%rbx), %xmm0 + paddd 16(%rbx), %xmm1 + paddd 32(%rbx), %xmm2 + paddd 48(%rbx), %xmm3 + paddd 128+0(%rbx), %xmm8 + paddd 128+16(%rbx), %xmm9 + paddd 128+32(%rbx), %xmm10 + paddd 128+48(%rbx), %xmm11 + paddd 256+0(%rbx), %xmm12 + paddd 256+16(%rbx), %xmm13 + paddd 256+32(%rbx), %xmm14 + paddd 256+48(%rbx), %xmm15 + movdqa %xmm0, 0(%rsp) + movdqa %xmm1, 16(%rsp) + movdqa %xmm2, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm8, 128+0(%rsp) + movdqa %xmm9, 128+16(%rsp) + movdqa %xmm10, 128+32(%rsp) + movdqa %xmm11, 128+48(%rsp) + movdqa %xmm12, 256+0(%rsp) + movdqa %xmm13, 256+16(%rsp) + movdqa %xmm14, 256+32(%rsp) + movdqa %xmm15, 256+48(%rsp) + + pxor 64(%rbx), %xmm0 + pxor 80(%rbx), %xmm1 + pxor 96(%rbx), %xmm2 + pxor 112(%rbx), %xmm3 + pxor 128+64(%rbx), %xmm8 + pxor 128+80(%rbx), %xmm9 + pxor 128+96(%rbx), %xmm10 + pxor 128+112(%rbx), %xmm11 + pxor 256+64(%rbx), %xmm12 + pxor 256+80(%rbx), %xmm13 + pxor 256+96(%rbx), %xmm14 + pxor 256+112(%rbx), %xmm15 + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + salsa8_core_3way_xmm + paddd 64(%rsp), %xmm0 + paddd 80(%rsp), %xmm1 + paddd 96(%rsp), %xmm2 + paddd 112(%rsp), %xmm3 + paddd 128+64(%rsp), %xmm8 + paddd 128+80(%rsp), %xmm9 + paddd 128+96(%rsp), %xmm10 + paddd 128+112(%rsp), %xmm11 + paddd 256+64(%rsp), %xmm12 + paddd 256+80(%rsp), %xmm13 + paddd 256+96(%rsp), %xmm14 + paddd 256+112(%rsp), %xmm15 + + addq $3*128, %rbx + cmpq %rax, %rbx + jne scrypt_core_3way_xmm_loop1 + + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + + movq %r8, %rcx + subq $1, %r8 +scrypt_core_3way_xmm_loop2: + movd %xmm0, %ebp + movd %xmm8, %ebx + movd %xmm12, %eax + pxor 0(%rsp), %xmm0 + pxor 16(%rsp), %xmm1 + pxor 32(%rsp), %xmm2 + pxor 48(%rsp), %xmm3 + pxor 128+0(%rsp), %xmm8 + pxor 128+16(%rsp), %xmm9 + pxor 128+32(%rsp), %xmm10 + pxor 128+48(%rsp), %xmm11 + pxor 256+0(%rsp), %xmm12 + pxor 256+16(%rsp), %xmm13 + pxor 256+32(%rsp), %xmm14 + pxor 256+48(%rsp), 
%xmm15 + andl %r8d, %ebp + leaq (%rbp, %rbp, 2), %rbp + shll $7, %ebp + andl %r8d, %ebx + leaq 1(%rbx, %rbx, 2), %rbx + shll $7, %ebx + andl %r8d, %eax + leaq 2(%rax, %rax, 2), %rax + shll $7, %eax + pxor 0(%rsi, %rbp), %xmm0 + pxor 16(%rsi, %rbp), %xmm1 + pxor 32(%rsi, %rbp), %xmm2 + pxor 48(%rsi, %rbp), %xmm3 + pxor 0(%rsi, %rbx), %xmm8 + pxor 16(%rsi, %rbx), %xmm9 + pxor 32(%rsi, %rbx), %xmm10 + pxor 48(%rsi, %rbx), %xmm11 + pxor 0(%rsi, %rax), %xmm12 + pxor 16(%rsi, %rax), %xmm13 + pxor 32(%rsi, %rax), %xmm14 + pxor 48(%rsi, %rax), %xmm15 + + movdqa %xmm0, 0(%rsp) + movdqa %xmm1, 16(%rsp) + movdqa %xmm2, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm8, 128+0(%rsp) + movdqa %xmm9, 128+16(%rsp) + movdqa %xmm10, 128+32(%rsp) + movdqa %xmm11, 128+48(%rsp) + movdqa %xmm12, 256+0(%rsp) + movdqa %xmm13, 256+16(%rsp) + movdqa %xmm14, 256+32(%rsp) + movdqa %xmm15, 256+48(%rsp) + salsa8_core_3way_xmm + paddd 0(%rsp), %xmm0 + paddd 16(%rsp), %xmm1 + paddd 32(%rsp), %xmm2 + paddd 48(%rsp), %xmm3 + paddd 128+0(%rsp), %xmm8 + paddd 128+16(%rsp), %xmm9 + paddd 128+32(%rsp), %xmm10 + paddd 128+48(%rsp), %xmm11 + paddd 256+0(%rsp), %xmm12 + paddd 256+16(%rsp), %xmm13 + paddd 256+32(%rsp), %xmm14 + paddd 256+48(%rsp), %xmm15 + movdqa %xmm0, 0(%rsp) + movdqa %xmm1, 16(%rsp) + movdqa %xmm2, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm8, 128+0(%rsp) + movdqa %xmm9, 128+16(%rsp) + movdqa %xmm10, 128+32(%rsp) + movdqa %xmm11, 128+48(%rsp) + movdqa %xmm12, 256+0(%rsp) + movdqa %xmm13, 256+16(%rsp) + movdqa %xmm14, 256+32(%rsp) + movdqa %xmm15, 256+48(%rsp) + + pxor 64(%rsi, %rbp), %xmm0 + pxor 80(%rsi, %rbp), %xmm1 + pxor 96(%rsi, %rbp), %xmm2 + pxor 112(%rsi, %rbp), %xmm3 + pxor 64(%rsi, %rbx), %xmm8 + pxor 80(%rsi, %rbx), %xmm9 + pxor 96(%rsi, %rbx), %xmm10 + pxor 112(%rsi, %rbx), %xmm11 + pxor 64(%rsi, %rax), %xmm12 + pxor 80(%rsi, %rax), %xmm13 + pxor 96(%rsi, %rax), %xmm14 + pxor 112(%rsi, %rax), %xmm15 + pxor 64(%rsp), %xmm0 + pxor 80(%rsp), %xmm1 + pxor 96(%rsp), %xmm2 + pxor 112(%rsp), %xmm3 + pxor 128+64(%rsp), %xmm8 + pxor 128+80(%rsp), %xmm9 + pxor 128+96(%rsp), %xmm10 + pxor 128+112(%rsp), %xmm11 + pxor 256+64(%rsp), %xmm12 + pxor 256+80(%rsp), %xmm13 + pxor 256+96(%rsp), %xmm14 + pxor 256+112(%rsp), %xmm15 + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + salsa8_core_3way_xmm + paddd 64(%rsp), %xmm0 + paddd 80(%rsp), %xmm1 + paddd 96(%rsp), %xmm2 + paddd 112(%rsp), %xmm3 + paddd 128+64(%rsp), %xmm8 + paddd 128+80(%rsp), %xmm9 + paddd 128+96(%rsp), %xmm10 + paddd 128+112(%rsp), %xmm11 + paddd 256+64(%rsp), %xmm12 + paddd 256+80(%rsp), %xmm13 + paddd 256+96(%rsp), %xmm14 + paddd 256+112(%rsp), %xmm15 + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + + subq $1, %rcx + ja scrypt_core_3way_xmm_loop2 + + scrypt_shuffle %rsp, 0, %rdi, 0 + scrypt_shuffle %rsp, 64, %rdi, 64 + scrypt_shuffle %rsp, 128, %rdi, 128 + scrypt_shuffle %rsp, 192, %rdi, 192 + scrypt_shuffle %rsp, 256, %rdi, 256 + scrypt_shuffle %rsp, 320, %rdi, 320 + + 
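+	/* The scrypt_shuffle word permutation applied on entry is self-inverse,
+	 * so applying it again here writes the result back to the caller's
+	 * buffer in the original scrypt block layout before the registers are
+	 * restored. */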
scrypt_core_3way_cleanup + ret + + +#if defined(USE_AVX2) + +.macro salsa8_core_6way_avx2_doubleround + vpaddd %ymm0, %ymm1, %ymm4 + vpaddd %ymm8, %ymm9, %ymm6 + vpaddd %ymm12, %ymm13, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm0, %ymm4 + vpaddd %ymm11, %ymm8, %ymm6 + vpaddd %ymm15, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm3, %ymm4 + vpaddd %ymm10, %ymm11, %ymm6 + vpaddd %ymm14, %ymm15, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm3, %ymm3 + vpshufd $0x93, %ymm11, %ymm11 + vpshufd $0x93, %ymm15, %ymm15 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm2, %ymm4 + vpaddd %ymm9, %ymm10, %ymm6 + vpaddd %ymm13, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpshufd $0x4e, %ymm14, %ymm14 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + + vpaddd %ymm0, %ymm3, %ymm4 + vpaddd %ymm8, %ymm11, %ymm6 + vpaddd %ymm12, %ymm15, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpshufd $0x39, %ymm1, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpshufd $0x39, %ymm9, %ymm9 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpshufd $0x39, %ymm13, %ymm13 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm0, %ymm4 + vpaddd %ymm9, %ymm8, %ymm6 + vpaddd %ymm13, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm1, %ymm4 + vpaddd %ymm10, %ymm9, %ymm6 + vpaddd %ymm14, %ymm13, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm1, %ymm1 + vpshufd $0x93, %ymm9, %ymm9 + vpshufd $0x93, %ymm13, %ymm13 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm2, %ymm4 + vpaddd %ymm11, %ymm10, %ymm6 + vpaddd %ymm15, %ymm14, %ymm7 + 
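+	/* AVX2 6-way Salsa20/8 double round: same structure as the 3-way
+	 * xmm/AVX rounds, but each 256-bit ymm register holds the same 16-byte
+	 * row of two independent lanes (one in the low, one in the high 128
+	 * bits), so three register groups process six lanes per call. */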
vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpshufd $0x4e, %ymm14, %ymm14 + vpshufd $0x39, %ymm11, %ymm11 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpshufd $0x39, %ymm3, %ymm3 + vpshufd $0x39, %ymm15, %ymm15 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 +.endm + +.macro salsa8_core_6way_avx2 + salsa8_core_6way_avx2_doubleround + salsa8_core_6way_avx2_doubleround + salsa8_core_6way_avx2_doubleround + salsa8_core_6way_avx2_doubleround +.endm + + .text + .p2align 6 + .globl scrypt_core_6way + .globl _scrypt_core_6way +scrypt_core_6way: +_scrypt_core_6way: + pushq %rbx + pushq %rbp +#if defined(_WIN64) || defined(__CYGWIN__) + subq $176, %rsp + vmovdqa %xmm6, 8(%rsp) + vmovdqa %xmm7, 24(%rsp) + vmovdqa %xmm8, 40(%rsp) + vmovdqa %xmm9, 56(%rsp) + vmovdqa %xmm10, 72(%rsp) + vmovdqa %xmm11, 88(%rsp) + vmovdqa %xmm12, 104(%rsp) + vmovdqa %xmm13, 120(%rsp) + vmovdqa %xmm14, 136(%rsp) + vmovdqa %xmm15, 152(%rsp) + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#else + movq %rdx, %r8 +#endif + movq %rsp, %rdx + subq $768, %rsp + andq $-128, %rsp + +.macro scrypt_core_6way_cleanup + movq %rdx, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + popq %rdi + vmovdqa 8(%rsp), %xmm6 + vmovdqa 24(%rsp), %xmm7 + vmovdqa 40(%rsp), %xmm8 + vmovdqa 56(%rsp), %xmm9 + vmovdqa 72(%rsp), %xmm10 + vmovdqa 88(%rsp), %xmm11 + vmovdqa 104(%rsp), %xmm12 + vmovdqa 120(%rsp), %xmm13 + vmovdqa 136(%rsp), %xmm14 + vmovdqa 152(%rsp), %xmm15 + addq $176, %rsp +#endif + popq %rbp + popq %rbx +.endm + +.macro scrypt_shuffle_pack2 src, so, dest, do + vmovdqa \so+0*16(\src), %xmm0 + vmovdqa \so+1*16(\src), %xmm1 + vmovdqa \so+2*16(\src), %xmm2 + vmovdqa \so+3*16(\src), %xmm3 + vinserti128 $1, \so+128+0*16(\src), %ymm0, %ymm0 + vinserti128 $1, \so+128+1*16(\src), %ymm1, %ymm1 + vinserti128 $1, \so+128+2*16(\src), %ymm2, %ymm2 + vinserti128 $1, \so+128+3*16(\src), %ymm3, %ymm3 + vpblendd $0x33, %ymm0, %ymm2, %ymm4 + vpblendd $0xcc, %ymm1, %ymm3, %ymm5 + vpblendd $0x33, %ymm2, %ymm0, %ymm6 + vpblendd $0xcc, %ymm3, %ymm1, %ymm7 + vpblendd $0x55, %ymm7, %ymm6, %ymm3 + vpblendd $0x55, %ymm6, %ymm5, %ymm2 + vpblendd $0x55, %ymm5, %ymm4, %ymm1 + vpblendd $0x55, %ymm4, %ymm7, %ymm0 + vmovdqa %ymm0, \do+0*32(\dest) + vmovdqa %ymm1, \do+1*32(\dest) + vmovdqa %ymm2, \do+2*32(\dest) + vmovdqa %ymm3, \do+3*32(\dest) +.endm + +.macro scrypt_shuffle_unpack2 src, so, dest, do + vmovdqa \so+0*32(\src), %ymm0 + vmovdqa \so+1*32(\src), %ymm1 + vmovdqa \so+2*32(\src), %ymm2 + vmovdqa \so+3*32(\src), %ymm3 + vpblendd $0x33, %ymm0, %ymm2, %ymm4 + vpblendd $0xcc, %ymm1, %ymm3, %ymm5 + vpblendd $0x33, %ymm2, %ymm0, %ymm6 + vpblendd $0xcc, %ymm3, %ymm1, %ymm7 + vpblendd $0x55, %ymm7, %ymm6, %ymm3 + vpblendd $0x55, %ymm6, %ymm5, %ymm2 + vpblendd $0x55, %ymm5, %ymm4, %ymm1 + vpblendd $0x55, %ymm4, %ymm7, %ymm0 + vmovdqa %xmm0, \do+0*16(\dest) + vmovdqa %xmm1, \do+1*16(\dest) + vmovdqa %xmm2, \do+2*16(\dest) + vmovdqa %xmm3, \do+3*16(\dest) + vextracti128 $1, %ymm0, \do+128+0*16(\dest) + vextracti128 $1, %ymm1, \do+128+1*16(\dest) + vextracti128 $1, %ymm2, \do+128+2*16(\dest) + vextracti128 $1, %ymm3, \do+128+3*16(\dest) +.endm + +scrypt_core_6way_avx2: + scrypt_shuffle_pack2 %rdi, 0*256+0, %rsp, 0*128 + scrypt_shuffle_pack2 %rdi, 0*256+64, %rsp, 1*128 + 
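+	/* scrypt_shuffle_pack2 loads one 64-byte half block from a lane and the
+	 * corresponding half of the lane 128 bytes later in the buffer, placing
+	 * them in the low and high 128-bit halves of the ymm registers
+	 * (vinserti128), and the vpblendd sequence rearranges the 32-bit words
+	 * into the same shuffled layout the 3-way code uses;
+	 * scrypt_shuffle_unpack2 is the inverse. */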
scrypt_shuffle_pack2 %rdi, 1*256+0, %rsp, 2*128 + scrypt_shuffle_pack2 %rdi, 1*256+64, %rsp, 3*128 + scrypt_shuffle_pack2 %rdi, 2*256+0, %rsp, 4*128 + scrypt_shuffle_pack2 %rdi, 2*256+64, %rsp, 5*128 + + vmovdqa 0*256+4*32(%rsp), %ymm0 + vmovdqa 0*256+5*32(%rsp), %ymm1 + vmovdqa 0*256+6*32(%rsp), %ymm2 + vmovdqa 0*256+7*32(%rsp), %ymm3 + vmovdqa 1*256+4*32(%rsp), %ymm8 + vmovdqa 1*256+5*32(%rsp), %ymm9 + vmovdqa 1*256+6*32(%rsp), %ymm10 + vmovdqa 1*256+7*32(%rsp), %ymm11 + vmovdqa 2*256+4*32(%rsp), %ymm12 + vmovdqa 2*256+5*32(%rsp), %ymm13 + vmovdqa 2*256+6*32(%rsp), %ymm14 + vmovdqa 2*256+7*32(%rsp), %ymm15 + + movq %rsi, %rbx + leaq (%r8, %r8, 2), %rax + shlq $8, %rax + addq %rsi, %rax +scrypt_core_6way_avx2_loop1: + vmovdqa %ymm0, 0*256+4*32(%rbx) + vmovdqa %ymm1, 0*256+5*32(%rbx) + vmovdqa %ymm2, 0*256+6*32(%rbx) + vmovdqa %ymm3, 0*256+7*32(%rbx) + vpxor 0*256+0*32(%rsp), %ymm0, %ymm0 + vpxor 0*256+1*32(%rsp), %ymm1, %ymm1 + vpxor 0*256+2*32(%rsp), %ymm2, %ymm2 + vpxor 0*256+3*32(%rsp), %ymm3, %ymm3 + vmovdqa %ymm8, 1*256+4*32(%rbx) + vmovdqa %ymm9, 1*256+5*32(%rbx) + vmovdqa %ymm10, 1*256+6*32(%rbx) + vmovdqa %ymm11, 1*256+7*32(%rbx) + vpxor 1*256+0*32(%rsp), %ymm8, %ymm8 + vpxor 1*256+1*32(%rsp), %ymm9, %ymm9 + vpxor 1*256+2*32(%rsp), %ymm10, %ymm10 + vpxor 1*256+3*32(%rsp), %ymm11, %ymm11 + vmovdqa %ymm12, 2*256+4*32(%rbx) + vmovdqa %ymm13, 2*256+5*32(%rbx) + vmovdqa %ymm14, 2*256+6*32(%rbx) + vmovdqa %ymm15, 2*256+7*32(%rbx) + vpxor 2*256+0*32(%rsp), %ymm12, %ymm12 + vpxor 2*256+1*32(%rsp), %ymm13, %ymm13 + vpxor 2*256+2*32(%rsp), %ymm14, %ymm14 + vpxor 2*256+3*32(%rsp), %ymm15, %ymm15 + vmovdqa %ymm0, 0*256+0*32(%rbx) + vmovdqa %ymm1, 0*256+1*32(%rbx) + vmovdqa %ymm2, 0*256+2*32(%rbx) + vmovdqa %ymm3, 0*256+3*32(%rbx) + vmovdqa %ymm8, 1*256+0*32(%rbx) + vmovdqa %ymm9, 1*256+1*32(%rbx) + vmovdqa %ymm10, 1*256+2*32(%rbx) + vmovdqa %ymm11, 1*256+3*32(%rbx) + vmovdqa %ymm12, 2*256+0*32(%rbx) + vmovdqa %ymm13, 2*256+1*32(%rbx) + vmovdqa %ymm14, 2*256+2*32(%rbx) + vmovdqa %ymm15, 2*256+3*32(%rbx) + + salsa8_core_6way_avx2 + vpaddd 0*256+0*32(%rbx), %ymm0, %ymm0 + vpaddd 0*256+1*32(%rbx), %ymm1, %ymm1 + vpaddd 0*256+2*32(%rbx), %ymm2, %ymm2 + vpaddd 0*256+3*32(%rbx), %ymm3, %ymm3 + vpaddd 1*256+0*32(%rbx), %ymm8, %ymm8 + vpaddd 1*256+1*32(%rbx), %ymm9, %ymm9 + vpaddd 1*256+2*32(%rbx), %ymm10, %ymm10 + vpaddd 1*256+3*32(%rbx), %ymm11, %ymm11 + vpaddd 2*256+0*32(%rbx), %ymm12, %ymm12 + vpaddd 2*256+1*32(%rbx), %ymm13, %ymm13 + vpaddd 2*256+2*32(%rbx), %ymm14, %ymm14 + vpaddd 2*256+3*32(%rbx), %ymm15, %ymm15 + vmovdqa %ymm0, 0*256+0*32(%rsp) + vmovdqa %ymm1, 0*256+1*32(%rsp) + vmovdqa %ymm2, 0*256+2*32(%rsp) + vmovdqa %ymm3, 0*256+3*32(%rsp) + vmovdqa %ymm8, 1*256+0*32(%rsp) + vmovdqa %ymm9, 1*256+1*32(%rsp) + vmovdqa %ymm10, 1*256+2*32(%rsp) + vmovdqa %ymm11, 1*256+3*32(%rsp) + vmovdqa %ymm12, 2*256+0*32(%rsp) + vmovdqa %ymm13, 2*256+1*32(%rsp) + vmovdqa %ymm14, 2*256+2*32(%rsp) + vmovdqa %ymm15, 2*256+3*32(%rsp) + + vpxor 0*256+4*32(%rbx), %ymm0, %ymm0 + vpxor 0*256+5*32(%rbx), %ymm1, %ymm1 + vpxor 0*256+6*32(%rbx), %ymm2, %ymm2 + vpxor 0*256+7*32(%rbx), %ymm3, %ymm3 + vpxor 1*256+4*32(%rbx), %ymm8, %ymm8 + vpxor 1*256+5*32(%rbx), %ymm9, %ymm9 + vpxor 1*256+6*32(%rbx), %ymm10, %ymm10 + vpxor 1*256+7*32(%rbx), %ymm11, %ymm11 + vpxor 2*256+4*32(%rbx), %ymm12, %ymm12 + vpxor 2*256+5*32(%rbx), %ymm13, %ymm13 + vpxor 2*256+6*32(%rbx), %ymm14, %ymm14 + vpxor 2*256+7*32(%rbx), %ymm15, %ymm15 + vmovdqa %ymm0, 0*256+4*32(%rsp) + vmovdqa %ymm1, 0*256+5*32(%rsp) + vmovdqa %ymm2, 0*256+6*32(%rsp) + 
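+	/* 6-way loop 1: each iteration stores one 6*128-byte scratchpad entry
+	 * (three 256-byte groups of two packed lanes) at %rbx; %rax = %rsi +
+	 * N*768 marks the end of the fill, so the loop runs N (%r8) times. */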
vmovdqa %ymm3, 0*256+7*32(%rsp) + vmovdqa %ymm8, 1*256+4*32(%rsp) + vmovdqa %ymm9, 1*256+5*32(%rsp) + vmovdqa %ymm10, 1*256+6*32(%rsp) + vmovdqa %ymm11, 1*256+7*32(%rsp) + vmovdqa %ymm12, 2*256+4*32(%rsp) + vmovdqa %ymm13, 2*256+5*32(%rsp) + vmovdqa %ymm14, 2*256+6*32(%rsp) + vmovdqa %ymm15, 2*256+7*32(%rsp) + salsa8_core_6way_avx2 + vpaddd 0*256+4*32(%rsp), %ymm0, %ymm0 + vpaddd 0*256+5*32(%rsp), %ymm1, %ymm1 + vpaddd 0*256+6*32(%rsp), %ymm2, %ymm2 + vpaddd 0*256+7*32(%rsp), %ymm3, %ymm3 + vpaddd 1*256+4*32(%rsp), %ymm8, %ymm8 + vpaddd 1*256+5*32(%rsp), %ymm9, %ymm9 + vpaddd 1*256+6*32(%rsp), %ymm10, %ymm10 + vpaddd 1*256+7*32(%rsp), %ymm11, %ymm11 + vpaddd 2*256+4*32(%rsp), %ymm12, %ymm12 + vpaddd 2*256+5*32(%rsp), %ymm13, %ymm13 + vpaddd 2*256+6*32(%rsp), %ymm14, %ymm14 + vpaddd 2*256+7*32(%rsp), %ymm15, %ymm15 + + addq $6*128, %rbx + cmpq %rax, %rbx + jne scrypt_core_6way_avx2_loop1 + + vmovdqa %ymm0, 0*256+4*32(%rsp) + vmovdqa %ymm1, 0*256+5*32(%rsp) + vmovdqa %ymm2, 0*256+6*32(%rsp) + vmovdqa %ymm3, 0*256+7*32(%rsp) + vmovdqa %ymm8, 1*256+4*32(%rsp) + vmovdqa %ymm9, 1*256+5*32(%rsp) + vmovdqa %ymm10, 1*256+6*32(%rsp) + vmovdqa %ymm11, 1*256+7*32(%rsp) + vmovdqa %ymm12, 2*256+4*32(%rsp) + vmovdqa %ymm13, 2*256+5*32(%rsp) + vmovdqa %ymm14, 2*256+6*32(%rsp) + vmovdqa %ymm15, 2*256+7*32(%rsp) + + movq %r8, %rcx + leaq -1(%r8), %r11 +scrypt_core_6way_avx2_loop2: + vmovd %xmm0, %ebp + vmovd %xmm8, %ebx + vmovd %xmm12, %eax + vextracti128 $1, %ymm0, %xmm4 + vextracti128 $1, %ymm8, %xmm5 + vextracti128 $1, %ymm12, %xmm6 + vmovd %xmm4, %r8d + vmovd %xmm5, %r9d + vmovd %xmm6, %r10d + vpxor 0*256+0*32(%rsp), %ymm0, %ymm0 + vpxor 0*256+1*32(%rsp), %ymm1, %ymm1 + vpxor 0*256+2*32(%rsp), %ymm2, %ymm2 + vpxor 0*256+3*32(%rsp), %ymm3, %ymm3 + vpxor 1*256+0*32(%rsp), %ymm8, %ymm8 + vpxor 1*256+1*32(%rsp), %ymm9, %ymm9 + vpxor 1*256+2*32(%rsp), %ymm10, %ymm10 + vpxor 1*256+3*32(%rsp), %ymm11, %ymm11 + vpxor 2*256+0*32(%rsp), %ymm12, %ymm12 + vpxor 2*256+1*32(%rsp), %ymm13, %ymm13 + vpxor 2*256+2*32(%rsp), %ymm14, %ymm14 + vpxor 2*256+3*32(%rsp), %ymm15, %ymm15 + andl %r11d, %ebp + leaq 0(%rbp, %rbp, 2), %rbp + shll $8, %ebp + andl %r11d, %ebx + leaq 1(%rbx, %rbx, 2), %rbx + shll $8, %ebx + andl %r11d, %eax + leaq 2(%rax, %rax, 2), %rax + shll $8, %eax + andl %r11d, %r8d + leaq 0(%r8, %r8, 2), %r8 + shll $8, %r8d + andl %r11d, %r9d + leaq 1(%r9, %r9, 2), %r9 + shll $8, %r9d + andl %r11d, %r10d + leaq 2(%r10, %r10, 2), %r10 + shll $8, %r10d + vmovdqa 0*32(%rsi, %rbp), %xmm4 + vinserti128 $1, 0*32+16(%rsi, %r8), %ymm4, %ymm4 + vmovdqa 1*32(%rsi, %rbp), %xmm5 + vinserti128 $1, 1*32+16(%rsi, %r8), %ymm5, %ymm5 + vmovdqa 2*32(%rsi, %rbp), %xmm6 + vinserti128 $1, 2*32+16(%rsi, %r8), %ymm6, %ymm6 + vmovdqa 3*32(%rsi, %rbp), %xmm7 + vinserti128 $1, 3*32+16(%rsi, %r8), %ymm7, %ymm7 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm6, %ymm2, %ymm2 + vpxor %ymm7, %ymm3, %ymm3 + vmovdqa 0*32(%rsi, %rbx), %xmm4 + vinserti128 $1, 0*32+16(%rsi, %r9), %ymm4, %ymm4 + vmovdqa 1*32(%rsi, %rbx), %xmm5 + vinserti128 $1, 1*32+16(%rsi, %r9), %ymm5, %ymm5 + vmovdqa 2*32(%rsi, %rbx), %xmm6 + vinserti128 $1, 2*32+16(%rsi, %r9), %ymm6, %ymm6 + vmovdqa 3*32(%rsi, %rbx), %xmm7 + vinserti128 $1, 3*32+16(%rsi, %r9), %ymm7, %ymm7 + vpxor %ymm4, %ymm8, %ymm8 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm10, %ymm10 + vpxor %ymm7, %ymm11, %ymm11 + vmovdqa 0*32(%rsi, %rax), %xmm4 + vinserti128 $1, 0*32+16(%rsi, %r10), %ymm4, %ymm4 + vmovdqa 1*32(%rsi, %rax), %xmm5 + vinserti128 $1, 1*32+16(%rsi, %r10), %ymm5, %ymm5 + 
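+	/* 6-way loop 2: six scratchpad indices per iteration -- three taken from
+	 * the low xmm halves with vmovd, three from the high halves via
+	 * vextracti128 -- each masked with N-1 (%r11d).  The low and high
+	 * 128-bit halves of every ymm are fetched separately from the scratchpad
+	 * and recombined with vinserti128 before being XORed into the state. */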
vmovdqa 2*32(%rsi, %rax), %xmm6 + vinserti128 $1, 2*32+16(%rsi, %r10), %ymm6, %ymm6 + vmovdqa 3*32(%rsi, %rax), %xmm7 + vinserti128 $1, 3*32+16(%rsi, %r10), %ymm7, %ymm7 + vpxor %ymm4, %ymm12, %ymm12 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm6, %ymm14, %ymm14 + vpxor %ymm7, %ymm15, %ymm15 + + vmovdqa %ymm0, 0*256+0*32(%rsp) + vmovdqa %ymm1, 0*256+1*32(%rsp) + vmovdqa %ymm2, 0*256+2*32(%rsp) + vmovdqa %ymm3, 0*256+3*32(%rsp) + vmovdqa %ymm8, 1*256+0*32(%rsp) + vmovdqa %ymm9, 1*256+1*32(%rsp) + vmovdqa %ymm10, 1*256+2*32(%rsp) + vmovdqa %ymm11, 1*256+3*32(%rsp) + vmovdqa %ymm12, 2*256+0*32(%rsp) + vmovdqa %ymm13, 2*256+1*32(%rsp) + vmovdqa %ymm14, 2*256+2*32(%rsp) + vmovdqa %ymm15, 2*256+3*32(%rsp) + salsa8_core_6way_avx2 + vpaddd 0*256+0*32(%rsp), %ymm0, %ymm0 + vpaddd 0*256+1*32(%rsp), %ymm1, %ymm1 + vpaddd 0*256+2*32(%rsp), %ymm2, %ymm2 + vpaddd 0*256+3*32(%rsp), %ymm3, %ymm3 + vpaddd 1*256+0*32(%rsp), %ymm8, %ymm8 + vpaddd 1*256+1*32(%rsp), %ymm9, %ymm9 + vpaddd 1*256+2*32(%rsp), %ymm10, %ymm10 + vpaddd 1*256+3*32(%rsp), %ymm11, %ymm11 + vpaddd 2*256+0*32(%rsp), %ymm12, %ymm12 + vpaddd 2*256+1*32(%rsp), %ymm13, %ymm13 + vpaddd 2*256+2*32(%rsp), %ymm14, %ymm14 + vpaddd 2*256+3*32(%rsp), %ymm15, %ymm15 + vmovdqa %ymm0, 0*256+0*32(%rsp) + vmovdqa %ymm1, 0*256+1*32(%rsp) + vmovdqa %ymm2, 0*256+2*32(%rsp) + vmovdqa %ymm3, 0*256+3*32(%rsp) + vmovdqa %ymm8, 1*256+0*32(%rsp) + vmovdqa %ymm9, 1*256+1*32(%rsp) + vmovdqa %ymm10, 1*256+2*32(%rsp) + vmovdqa %ymm11, 1*256+3*32(%rsp) + vmovdqa %ymm12, 2*256+0*32(%rsp) + vmovdqa %ymm13, 2*256+1*32(%rsp) + vmovdqa %ymm14, 2*256+2*32(%rsp) + vmovdqa %ymm15, 2*256+3*32(%rsp) + + vmovdqa 4*32(%rsi, %rbp), %xmm4 + vinserti128 $1, 4*32+16(%rsi, %r8), %ymm4, %ymm4 + vmovdqa 5*32(%rsi, %rbp), %xmm5 + vinserti128 $1, 5*32+16(%rsi, %r8), %ymm5, %ymm5 + vmovdqa 6*32(%rsi, %rbp), %xmm6 + vinserti128 $1, 6*32+16(%rsi, %r8), %ymm6, %ymm6 + vmovdqa 7*32(%rsi, %rbp), %xmm7 + vinserti128 $1, 7*32+16(%rsi, %r8), %ymm7, %ymm7 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm6, %ymm2, %ymm2 + vpxor %ymm7, %ymm3, %ymm3 + vmovdqa 4*32(%rsi, %rbx), %xmm4 + vinserti128 $1, 4*32+16(%rsi, %r9), %ymm4, %ymm4 + vmovdqa 5*32(%rsi, %rbx), %xmm5 + vinserti128 $1, 5*32+16(%rsi, %r9), %ymm5, %ymm5 + vmovdqa 6*32(%rsi, %rbx), %xmm6 + vinserti128 $1, 6*32+16(%rsi, %r9), %ymm6, %ymm6 + vmovdqa 7*32(%rsi, %rbx), %xmm7 + vinserti128 $1, 7*32+16(%rsi, %r9), %ymm7, %ymm7 + vpxor %ymm4, %ymm8, %ymm8 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm10, %ymm10 + vpxor %ymm7, %ymm11, %ymm11 + vmovdqa 4*32(%rsi, %rax), %xmm4 + vinserti128 $1, 4*32+16(%rsi, %r10), %ymm4, %ymm4 + vmovdqa 5*32(%rsi, %rax), %xmm5 + vinserti128 $1, 5*32+16(%rsi, %r10), %ymm5, %ymm5 + vmovdqa 6*32(%rsi, %rax), %xmm6 + vinserti128 $1, 6*32+16(%rsi, %r10), %ymm6, %ymm6 + vmovdqa 7*32(%rsi, %rax), %xmm7 + vinserti128 $1, 7*32+16(%rsi, %r10), %ymm7, %ymm7 + vpxor %ymm4, %ymm12, %ymm12 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm6, %ymm14, %ymm14 + vpxor %ymm7, %ymm15, %ymm15 + vpxor 0*256+4*32(%rsp), %ymm0, %ymm0 + vpxor 0*256+5*32(%rsp), %ymm1, %ymm1 + vpxor 0*256+6*32(%rsp), %ymm2, %ymm2 + vpxor 0*256+7*32(%rsp), %ymm3, %ymm3 + vpxor 1*256+4*32(%rsp), %ymm8, %ymm8 + vpxor 1*256+5*32(%rsp), %ymm9, %ymm9 + vpxor 1*256+6*32(%rsp), %ymm10, %ymm10 + vpxor 1*256+7*32(%rsp), %ymm11, %ymm11 + vpxor 2*256+4*32(%rsp), %ymm12, %ymm12 + vpxor 2*256+5*32(%rsp), %ymm13, %ymm13 + vpxor 2*256+6*32(%rsp), %ymm14, %ymm14 + vpxor 2*256+7*32(%rsp), %ymm15, %ymm15 + vmovdqa %ymm0, 0*256+4*32(%rsp) + vmovdqa %ymm1, 
0*256+5*32(%rsp) + vmovdqa %ymm2, 0*256+6*32(%rsp) + vmovdqa %ymm3, 0*256+7*32(%rsp) + vmovdqa %ymm8, 1*256+4*32(%rsp) + vmovdqa %ymm9, 1*256+5*32(%rsp) + vmovdqa %ymm10, 1*256+6*32(%rsp) + vmovdqa %ymm11, 1*256+7*32(%rsp) + vmovdqa %ymm12, 2*256+4*32(%rsp) + vmovdqa %ymm13, 2*256+5*32(%rsp) + vmovdqa %ymm14, 2*256+6*32(%rsp) + vmovdqa %ymm15, 2*256+7*32(%rsp) + salsa8_core_6way_avx2 + vpaddd 0*256+4*32(%rsp), %ymm0, %ymm0 + vpaddd 0*256+5*32(%rsp), %ymm1, %ymm1 + vpaddd 0*256+6*32(%rsp), %ymm2, %ymm2 + vpaddd 0*256+7*32(%rsp), %ymm3, %ymm3 + vpaddd 1*256+4*32(%rsp), %ymm8, %ymm8 + vpaddd 1*256+5*32(%rsp), %ymm9, %ymm9 + vpaddd 1*256+6*32(%rsp), %ymm10, %ymm10 + vpaddd 1*256+7*32(%rsp), %ymm11, %ymm11 + vpaddd 2*256+4*32(%rsp), %ymm12, %ymm12 + vpaddd 2*256+5*32(%rsp), %ymm13, %ymm13 + vpaddd 2*256+6*32(%rsp), %ymm14, %ymm14 + vpaddd 2*256+7*32(%rsp), %ymm15, %ymm15 + vmovdqa %ymm0, 0*256+4*32(%rsp) + vmovdqa %ymm1, 0*256+5*32(%rsp) + vmovdqa %ymm2, 0*256+6*32(%rsp) + vmovdqa %ymm3, 0*256+7*32(%rsp) + vmovdqa %ymm8, 1*256+4*32(%rsp) + vmovdqa %ymm9, 1*256+5*32(%rsp) + vmovdqa %ymm10, 1*256+6*32(%rsp) + vmovdqa %ymm11, 1*256+7*32(%rsp) + vmovdqa %ymm12, 2*256+4*32(%rsp) + vmovdqa %ymm13, 2*256+5*32(%rsp) + vmovdqa %ymm14, 2*256+6*32(%rsp) + vmovdqa %ymm15, 2*256+7*32(%rsp) + + subq $1, %rcx + ja scrypt_core_6way_avx2_loop2 + + scrypt_shuffle_unpack2 %rsp, 0*128, %rdi, 0*256+0 + scrypt_shuffle_unpack2 %rsp, 1*128, %rdi, 0*256+64 + scrypt_shuffle_unpack2 %rsp, 2*128, %rdi, 1*256+0 + scrypt_shuffle_unpack2 %rsp, 3*128, %rdi, 1*256+64 + scrypt_shuffle_unpack2 %rsp, 4*128, %rdi, 2*256+0 + scrypt_shuffle_unpack2 %rsp, 5*128, %rdi, 2*256+64 + + scrypt_core_6way_cleanup + ret + +#endif /* USE_AVX2 */ + +#endif diff --git a/scrypt-x86.S b/scrypt-x86.S index 5ab7eda65..0f5dd82a3 100644 --- a/scrypt-x86.S +++ b/scrypt-x86.S @@ -32,42 +32,11 @@ #if defined(USE_ASM) && defined(__i386__) -.macro scrypt_shuffle src, so, dest, do - movl \so+60(\src), %eax - movl \so+44(\src), %ebx - movl \so+28(\src), %ecx - movl \so+12(\src), %edx - movl %eax, \do+12(\dest) - movl %ebx, \do+28(\dest) - movl %ecx, \do+44(\dest) - movl %edx, \do+60(\dest) - movl \so+40(\src), %eax - movl \so+8(\src), %ebx - movl \so+48(\src), %ecx - movl \so+16(\src), %edx - movl %eax, \do+8(\dest) - movl %ebx, \do+40(\dest) - movl %ecx, \do+16(\dest) - movl %edx, \do+48(\dest) - movl \so+20(\src), %eax - movl \so+4(\src), %ebx - movl \so+52(\src), %ecx - movl \so+36(\src), %edx - movl %eax, \do+4(\dest) - movl %ebx, \do+20(\dest) - movl %ecx, \do+36(\dest) - movl %edx, \do+52(\dest) - movl \so+0(\src), %eax - movl \so+24(\src), %ebx - movl \so+32(\src), %ecx - movl \so+56(\src), %edx - movl %eax, \do+0(\dest) - movl %ebx, \do+24(\dest) - movl %ecx, \do+32(\dest) - movl %edx, \do+56(\dest) -.endm -.macro salsa8_core_gen_quadround + + .text + .p2align 5 +salsa8_core_gen: movl 52(%esp), %ecx movl 4(%esp), %edx movl 20(%esp), %ebx @@ -385,227 +354,2011 @@ roll $18, %edi xorl %edi, %ebp movl %ebp, 64(%esp) -.endm - - .text - .p2align 5 -salsa8_core_gen: - salsa8_core_gen_quadround - salsa8_core_gen_quadround - ret - - - .text - .p2align 5 - .globl scrypt_core - .globl _scrypt_core -scrypt_core: -_scrypt_core: - pushl %ebx - pushl %ebp - pushl %edi - pushl %esi - - /* Check for SSE2 availability */ - movl $1, %eax - cpuid - andl $0x04000000, %edx - jnz scrypt_core_sse2 - -scrypt_core_gen: - movl 20(%esp), %edi - movl 24(%esp), %esi + movl 52(%esp), %ecx + movl 4(%esp), %edx + movl 20(%esp), %ebx + movl 8(%esp), %esi + leal (%ecx, %edx), %edi + 
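+	/* scrypt-x86.S scalar path: the former scrypt_shuffle,
+	 * salsa8_core_gen_quadround and scrypt_core_macro* helpers are written
+	 * out inline in this patch; the add / roll $7,$9,$13,$18 / xor groups
+	 * are the Salsa20/8 quarter rounds operating on the 16-word state kept
+	 * at 4..64(%esp). */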
roll $7, %edi + xorl %edi, %ebx + movl %ebx, 4(%esp) + movl 36(%esp), %edi + leal (%edx, %ebx), %ebp + roll $9, %ebp + xorl %ebp, %edi + movl 24(%esp), %ebp + movl %edi, 8(%esp) + addl %edi, %ebx + roll $13, %ebx + xorl %ebx, %ecx + movl 40(%esp), %ebx + movl %ecx, 20(%esp) + addl %edi, %ecx + roll $18, %ecx + leal (%esi, %ebp), %edi + roll $7, %edi + xorl %edi, %ebx + movl %ebx, 24(%esp) + movl 56(%esp), %edi + xorl %ecx, %edx + leal (%ebp, %ebx), %ecx + roll $9, %ecx + xorl %ecx, %edi + movl %edi, 36(%esp) movl 28(%esp), %ecx - subl $72, %esp - -.macro scrypt_core_macro1a p, q - movl \p(%edi), %eax - movl \q(%edi), %edx - movl %eax, \p(%esi) - movl %edx, \q(%esi) - xorl %edx, %eax - movl %eax, \p(%edi) - movl %eax, \p(%esp) -.endm - -.macro scrypt_core_macro1b p, q - movl \p(%edi), %eax - xorl \p(%esi, %edx), %eax - movl \q(%edi), %ebx - xorl \q(%esi, %edx), %ebx - movl %ebx, \q(%edi) - xorl %ebx, %eax - movl %eax, \p(%edi) - movl %eax, \p(%esp) -.endm - -.macro scrypt_core_macro2 p, q - movl \p(%esp), %eax - addl \p(%edi), %eax - movl %eax, \p(%edi) - xorl \q(%edi), %eax - movl %eax, \q(%edi) - movl %eax, \p(%esp) -.endm - -.macro scrypt_core_macro3 p, q - movl \p(%esp), %eax - addl \q(%edi), %eax - movl %eax, \q(%edi) -.endm - - shll $7, %ecx + movl %edx, 28(%esp) + movl 44(%esp), %edx + addl %edi, %ebx + roll $13, %ebx + xorl %ebx, %esi + movl 60(%esp), %ebx + movl %esi, 40(%esp) + addl %edi, %esi + roll $18, %esi + leal (%ecx, %edx), %edi + roll $7, %edi + xorl %edi, %ebx + movl %ebx, 44(%esp) + movl 12(%esp), %edi + xorl %esi, %ebp + leal (%edx, %ebx), %esi + roll $9, %esi + xorl %esi, %edi + movl %edi, 12(%esp) + movl 48(%esp), %esi + movl %ebp, 48(%esp) + movl 64(%esp), %ebp + addl %edi, %ebx + roll $13, %ebx + xorl %ebx, %ecx + movl 16(%esp), %ebx + movl %ecx, 16(%esp) + addl %edi, %ecx + roll $18, %ecx + leal (%esi, %ebp), %edi + roll $7, %edi + xorl %edi, %ebx + movl 32(%esp), %edi + xorl %ecx, %edx + leal (%ebp, %ebx), %ecx + roll $9, %ecx + xorl %ecx, %edi + movl %edi, 32(%esp) + movl %ebx, %ecx + movl %edx, 52(%esp) + movl 28(%esp), %edx + addl %edi, %ebx + roll $13, %ebx + xorl %ebx, %esi + movl 40(%esp), %ebx + movl %esi, 28(%esp) + addl %edi, %esi + roll $18, %esi + leal (%ecx, %edx), %edi + roll $7, %edi + xorl %edi, %ebx + movl %ebx, 40(%esp) + movl 12(%esp), %edi + xorl %esi, %ebp + leal (%edx, %ebx), %esi + roll $9, %esi + xorl %esi, %edi + movl %edi, 12(%esp) + movl 4(%esp), %esi + movl %ebp, 4(%esp) + movl 48(%esp), %ebp + addl %edi, %ebx + roll $13, %ebx + xorl %ebx, %ecx + movl 16(%esp), %ebx + movl %ecx, 16(%esp) + addl %edi, %ecx + roll $18, %ecx + leal (%esi, %ebp), %edi + roll $7, %edi + xorl %edi, %ebx + movl %ebx, 48(%esp) + movl 32(%esp), %edi + xorl %ecx, %edx + leal (%ebp, %ebx), %ecx + roll $9, %ecx + xorl %ecx, %edi + movl %edi, 32(%esp) + movl 24(%esp), %ecx + movl %edx, 24(%esp) + movl 52(%esp), %edx + addl %edi, %ebx + roll $13, %ebx + xorl %ebx, %esi + movl 28(%esp), %ebx + movl %esi, 28(%esp) + addl %edi, %esi + roll $18, %esi + leal (%ecx, %edx), %edi + roll $7, %edi + xorl %edi, %ebx + movl %ebx, 52(%esp) + movl 8(%esp), %edi + xorl %esi, %ebp + leal (%edx, %ebx), %esi + roll $9, %esi + xorl %esi, %edi + movl %edi, 8(%esp) + movl 44(%esp), %esi + movl %ebp, 44(%esp) + movl 4(%esp), %ebp + addl %edi, %ebx + roll $13, %ebx + xorl %ebx, %ecx + movl 20(%esp), %ebx + movl %ecx, 4(%esp) + addl %edi, %ecx + roll $18, %ecx + leal (%esi, %ebp), %edi + roll $7, %edi + xorl %edi, %ebx + movl 36(%esp), %edi + xorl %ecx, %edx + leal (%ebp, %ebx), %ecx + roll 
$9, %ecx + xorl %ecx, %edi + movl %edi, 20(%esp) + movl %ebx, %ecx + movl %edx, 36(%esp) + movl 24(%esp), %edx + addl %edi, %ebx + roll $13, %ebx + xorl %ebx, %esi + movl 28(%esp), %ebx + movl %esi, 24(%esp) + addl %edi, %esi + roll $18, %esi + leal (%ecx, %edx), %edi + roll $7, %edi + xorl %edi, %ebx + movl %ebx, 28(%esp) + xorl %esi, %ebp + movl 8(%esp), %esi + leal (%edx, %ebx), %edi + roll $9, %edi + xorl %edi, %esi + movl 40(%esp), %edi + movl %ebp, 8(%esp) + movl 44(%esp), %ebp + movl %esi, 40(%esp) + addl %esi, %ebx + roll $13, %ebx + xorl %ebx, %ecx + movl 4(%esp), %ebx + movl %ecx, 44(%esp) addl %esi, %ecx -scrypt_core_gen_loop1: - movl %esi, 64(%esp) - movl %ecx, 68(%esp) - - scrypt_core_macro1a 0, 64 - scrypt_core_macro1a 4, 68 - scrypt_core_macro1a 8, 72 - scrypt_core_macro1a 12, 76 - scrypt_core_macro1a 16, 80 - scrypt_core_macro1a 20, 84 - scrypt_core_macro1a 24, 88 - scrypt_core_macro1a 28, 92 - scrypt_core_macro1a 32, 96 - scrypt_core_macro1a 36, 100 - scrypt_core_macro1a 40, 104 - scrypt_core_macro1a 44, 108 - scrypt_core_macro1a 48, 112 - scrypt_core_macro1a 52, 116 - scrypt_core_macro1a 56, 120 - scrypt_core_macro1a 60, 124 - - call salsa8_core_gen - - movl 92(%esp), %edi - scrypt_core_macro2 0, 64 - scrypt_core_macro2 4, 68 - scrypt_core_macro2 8, 72 - scrypt_core_macro2 12, 76 - scrypt_core_macro2 16, 80 - scrypt_core_macro2 20, 84 - scrypt_core_macro2 24, 88 - scrypt_core_macro2 28, 92 - scrypt_core_macro2 32, 96 - scrypt_core_macro2 36, 100 - scrypt_core_macro2 40, 104 - scrypt_core_macro2 44, 108 - scrypt_core_macro2 48, 112 - scrypt_core_macro2 52, 116 - scrypt_core_macro2 56, 120 - scrypt_core_macro2 60, 124 - - call salsa8_core_gen - - movl 92(%esp), %edi - scrypt_core_macro3 0, 64 - scrypt_core_macro3 4, 68 - scrypt_core_macro3 8, 72 - scrypt_core_macro3 12, 76 - scrypt_core_macro3 16, 80 - scrypt_core_macro3 20, 84 - scrypt_core_macro3 24, 88 - scrypt_core_macro3 28, 92 - scrypt_core_macro3 32, 96 - scrypt_core_macro3 36, 100 - scrypt_core_macro3 40, 104 - scrypt_core_macro3 44, 108 - scrypt_core_macro3 48, 112 - scrypt_core_macro3 52, 116 - scrypt_core_macro3 56, 120 - scrypt_core_macro3 60, 124 - - movl 64(%esp), %esi - movl 68(%esp), %ecx - addl $128, %esi - cmpl %ecx, %esi - jne scrypt_core_gen_loop1 - - movl 96(%esp), %esi - movl 100(%esp), %ecx - movl %ecx, %eax - subl $1, %eax - movl %eax, 100(%esp) -scrypt_core_gen_loop2: - movl %ecx, 68(%esp) - - movl 64(%edi), %edx - andl 100(%esp), %edx - shll $7, %edx - - scrypt_core_macro1b 0, 64 - scrypt_core_macro1b 4, 68 - scrypt_core_macro1b 8, 72 - scrypt_core_macro1b 12, 76 - scrypt_core_macro1b 16, 80 - scrypt_core_macro1b 20, 84 - scrypt_core_macro1b 24, 88 - scrypt_core_macro1b 28, 92 - scrypt_core_macro1b 32, 96 - scrypt_core_macro1b 36, 100 - scrypt_core_macro1b 40, 104 - scrypt_core_macro1b 44, 108 - scrypt_core_macro1b 48, 112 - scrypt_core_macro1b 52, 116 - scrypt_core_macro1b 56, 120 - scrypt_core_macro1b 60, 124 - - call salsa8_core_gen - - movl 92(%esp), %edi - scrypt_core_macro2 0, 64 - scrypt_core_macro2 4, 68 - scrypt_core_macro2 8, 72 - scrypt_core_macro2 12, 76 - scrypt_core_macro2 16, 80 - scrypt_core_macro2 20, 84 - scrypt_core_macro2 24, 88 - scrypt_core_macro2 28, 92 - scrypt_core_macro2 32, 96 - scrypt_core_macro2 36, 100 - scrypt_core_macro2 40, 104 - scrypt_core_macro2 44, 108 - scrypt_core_macro2 48, 112 - scrypt_core_macro2 52, 116 - scrypt_core_macro2 56, 120 - scrypt_core_macro2 60, 124 - - call salsa8_core_gen - - movl 92(%esp), %edi - movl 96(%esp), %esi - scrypt_core_macro3 0, 64 
- scrypt_core_macro3 4, 68 - scrypt_core_macro3 8, 72 - scrypt_core_macro3 12, 76 - scrypt_core_macro3 16, 80 - scrypt_core_macro3 20, 84 - scrypt_core_macro3 24, 88 - scrypt_core_macro3 28, 92 - scrypt_core_macro3 32, 96 - scrypt_core_macro3 36, 100 - scrypt_core_macro3 40, 104 - scrypt_core_macro3 44, 108 - scrypt_core_macro3 48, 112 - scrypt_core_macro3 52, 116 - scrypt_core_macro3 56, 120 - scrypt_core_macro3 60, 124 - - movl 68(%esp), %ecx - subl $1, %ecx + roll $18, %ecx + leal (%edi, %ebp), %esi + roll $7, %esi + xorl %esi, %ebx + movl %ebx, 4(%esp) + movl 20(%esp), %esi + xorl %ecx, %edx + leal (%ebp, %ebx), %ecx + roll $9, %ecx + xorl %ecx, %esi + movl %esi, 56(%esp) + movl 48(%esp), %ecx + movl %edx, 20(%esp) + movl 36(%esp), %edx + addl %esi, %ebx + roll $13, %ebx + xorl %ebx, %edi + movl 24(%esp), %ebx + movl %edi, 24(%esp) + addl %esi, %edi + roll $18, %edi + leal (%ecx, %edx), %esi + roll $7, %esi + xorl %esi, %ebx + movl %ebx, 60(%esp) + movl 12(%esp), %esi + xorl %edi, %ebp + leal (%edx, %ebx), %edi + roll $9, %edi + xorl %edi, %esi + movl %esi, 12(%esp) + movl 52(%esp), %edi + movl %ebp, 36(%esp) + movl 8(%esp), %ebp + addl %esi, %ebx + roll $13, %ebx + xorl %ebx, %ecx + movl 16(%esp), %ebx + movl %ecx, 16(%esp) + addl %esi, %ecx + roll $18, %ecx + leal (%edi, %ebp), %esi + roll $7, %esi + xorl %esi, %ebx + movl 32(%esp), %esi + xorl %ecx, %edx + leal (%ebp, %ebx), %ecx + roll $9, %ecx + xorl %ecx, %esi + movl %esi, 32(%esp) + movl %ebx, %ecx + movl %edx, 48(%esp) + movl 20(%esp), %edx + addl %esi, %ebx + roll $13, %ebx + xorl %ebx, %edi + movl 24(%esp), %ebx + movl %edi, 20(%esp) + addl %esi, %edi + roll $18, %edi + leal (%ecx, %edx), %esi + roll $7, %esi + xorl %esi, %ebx + movl %ebx, 8(%esp) + movl 12(%esp), %esi + xorl %edi, %ebp + leal (%edx, %ebx), %edi + roll $9, %edi + xorl %edi, %esi + movl %esi, 12(%esp) + movl 28(%esp), %edi + movl %ebp, 52(%esp) + movl 36(%esp), %ebp + addl %esi, %ebx + roll $13, %ebx + xorl %ebx, %ecx + movl 16(%esp), %ebx + movl %ecx, 16(%esp) + addl %esi, %ecx + roll $18, %ecx + leal (%edi, %ebp), %esi + roll $7, %esi + xorl %esi, %ebx + movl %ebx, 28(%esp) + movl 32(%esp), %esi + xorl %ecx, %edx + leal (%ebp, %ebx), %ecx + roll $9, %ecx + xorl %ecx, %esi + movl %esi, 32(%esp) + movl 4(%esp), %ecx + movl %edx, 4(%esp) + movl 48(%esp), %edx + addl %esi, %ebx + roll $13, %ebx + xorl %ebx, %edi + movl 20(%esp), %ebx + movl %edi, 20(%esp) + addl %esi, %edi + roll $18, %edi + leal (%ecx, %edx), %esi + roll $7, %esi + xorl %esi, %ebx + movl %ebx, 48(%esp) + movl 40(%esp), %esi + xorl %edi, %ebp + leal (%edx, %ebx), %edi + roll $9, %edi + xorl %edi, %esi + movl %esi, 36(%esp) + movl 60(%esp), %edi + movl %ebp, 24(%esp) + movl 52(%esp), %ebp + addl %esi, %ebx + roll $13, %ebx + xorl %ebx, %ecx + movl 44(%esp), %ebx + movl %ecx, 40(%esp) + addl %esi, %ecx + roll $18, %ecx + leal (%edi, %ebp), %esi + roll $7, %esi + xorl %esi, %ebx + movl %ebx, 52(%esp) + movl 56(%esp), %esi + xorl %ecx, %edx + leal (%ebp, %ebx), %ecx + roll $9, %ecx + xorl %ecx, %esi + movl %esi, 56(%esp) + addl %esi, %ebx + movl %edx, 44(%esp) + roll $13, %ebx + xorl %ebx, %edi + movl %edi, 60(%esp) + addl %esi, %edi + roll $18, %edi + xorl %edi, %ebp + movl %ebp, 64(%esp) + ret + + + .text + .p2align 5 + .globl scrypt_core + .globl _scrypt_core +scrypt_core: +_scrypt_core: + pushl %ebx + pushl %ebp + pushl %edi + pushl %esi + + /* Check for SSE2 availability */ + movl $1, %eax + cpuid + andl $0x04000000, %edx + jnz scrypt_core_sse2 + +scrypt_core_gen: + movl 20(%esp), %edi + movl 
24(%esp), %esi + movl 28(%esp), %ecx + subl $72, %esp + + + + + + shll $7, %ecx + addl %esi, %ecx +scrypt_core_gen_loop1: + movl %esi, 64(%esp) + movl %ecx, 68(%esp) + + movl 0(%edi), %eax + movl 64(%edi), %edx + movl %eax, 0(%esi) + movl %edx, 64(%esi) + xorl %edx, %eax + movl %eax, 0(%edi) + movl %eax, 0(%esp) + movl 4(%edi), %eax + movl 68(%edi), %edx + movl %eax, 4(%esi) + movl %edx, 68(%esi) + xorl %edx, %eax + movl %eax, 4(%edi) + movl %eax, 4(%esp) + movl 8(%edi), %eax + movl 72(%edi), %edx + movl %eax, 8(%esi) + movl %edx, 72(%esi) + xorl %edx, %eax + movl %eax, 8(%edi) + movl %eax, 8(%esp) + movl 12(%edi), %eax + movl 76(%edi), %edx + movl %eax, 12(%esi) + movl %edx, 76(%esi) + xorl %edx, %eax + movl %eax, 12(%edi) + movl %eax, 12(%esp) + movl 16(%edi), %eax + movl 80(%edi), %edx + movl %eax, 16(%esi) + movl %edx, 80(%esi) + xorl %edx, %eax + movl %eax, 16(%edi) + movl %eax, 16(%esp) + movl 20(%edi), %eax + movl 84(%edi), %edx + movl %eax, 20(%esi) + movl %edx, 84(%esi) + xorl %edx, %eax + movl %eax, 20(%edi) + movl %eax, 20(%esp) + movl 24(%edi), %eax + movl 88(%edi), %edx + movl %eax, 24(%esi) + movl %edx, 88(%esi) + xorl %edx, %eax + movl %eax, 24(%edi) + movl %eax, 24(%esp) + movl 28(%edi), %eax + movl 92(%edi), %edx + movl %eax, 28(%esi) + movl %edx, 92(%esi) + xorl %edx, %eax + movl %eax, 28(%edi) + movl %eax, 28(%esp) + movl 32(%edi), %eax + movl 96(%edi), %edx + movl %eax, 32(%esi) + movl %edx, 96(%esi) + xorl %edx, %eax + movl %eax, 32(%edi) + movl %eax, 32(%esp) + movl 36(%edi), %eax + movl 100(%edi), %edx + movl %eax, 36(%esi) + movl %edx, 100(%esi) + xorl %edx, %eax + movl %eax, 36(%edi) + movl %eax, 36(%esp) + movl 40(%edi), %eax + movl 104(%edi), %edx + movl %eax, 40(%esi) + movl %edx, 104(%esi) + xorl %edx, %eax + movl %eax, 40(%edi) + movl %eax, 40(%esp) + movl 44(%edi), %eax + movl 108(%edi), %edx + movl %eax, 44(%esi) + movl %edx, 108(%esi) + xorl %edx, %eax + movl %eax, 44(%edi) + movl %eax, 44(%esp) + movl 48(%edi), %eax + movl 112(%edi), %edx + movl %eax, 48(%esi) + movl %edx, 112(%esi) + xorl %edx, %eax + movl %eax, 48(%edi) + movl %eax, 48(%esp) + movl 52(%edi), %eax + movl 116(%edi), %edx + movl %eax, 52(%esi) + movl %edx, 116(%esi) + xorl %edx, %eax + movl %eax, 52(%edi) + movl %eax, 52(%esp) + movl 56(%edi), %eax + movl 120(%edi), %edx + movl %eax, 56(%esi) + movl %edx, 120(%esi) + xorl %edx, %eax + movl %eax, 56(%edi) + movl %eax, 56(%esp) + movl 60(%edi), %eax + movl 124(%edi), %edx + movl %eax, 60(%esi) + movl %edx, 124(%esi) + xorl %edx, %eax + movl %eax, 60(%edi) + movl %eax, 60(%esp) + + call salsa8_core_gen + + movl 92(%esp), %edi + movl 0(%esp), %eax + addl 0(%edi), %eax + movl %eax, 0(%edi) + xorl 64(%edi), %eax + movl %eax, 64(%edi) + movl %eax, 0(%esp) + movl 4(%esp), %eax + addl 4(%edi), %eax + movl %eax, 4(%edi) + xorl 68(%edi), %eax + movl %eax, 68(%edi) + movl %eax, 4(%esp) + movl 8(%esp), %eax + addl 8(%edi), %eax + movl %eax, 8(%edi) + xorl 72(%edi), %eax + movl %eax, 72(%edi) + movl %eax, 8(%esp) + movl 12(%esp), %eax + addl 12(%edi), %eax + movl %eax, 12(%edi) + xorl 76(%edi), %eax + movl %eax, 76(%edi) + movl %eax, 12(%esp) + movl 16(%esp), %eax + addl 16(%edi), %eax + movl %eax, 16(%edi) + xorl 80(%edi), %eax + movl %eax, 80(%edi) + movl %eax, 16(%esp) + movl 20(%esp), %eax + addl 20(%edi), %eax + movl %eax, 20(%edi) + xorl 84(%edi), %eax + movl %eax, 84(%edi) + movl %eax, 20(%esp) + movl 24(%esp), %eax + addl 24(%edi), %eax + movl %eax, 24(%edi) + xorl 88(%edi), %eax + movl %eax, 88(%edi) + movl %eax, 24(%esp) + movl 28(%esp), %eax + 
addl 28(%edi), %eax + movl %eax, 28(%edi) + xorl 92(%edi), %eax + movl %eax, 92(%edi) + movl %eax, 28(%esp) + movl 32(%esp), %eax + addl 32(%edi), %eax + movl %eax, 32(%edi) + xorl 96(%edi), %eax + movl %eax, 96(%edi) + movl %eax, 32(%esp) + movl 36(%esp), %eax + addl 36(%edi), %eax + movl %eax, 36(%edi) + xorl 100(%edi), %eax + movl %eax, 100(%edi) + movl %eax, 36(%esp) + movl 40(%esp), %eax + addl 40(%edi), %eax + movl %eax, 40(%edi) + xorl 104(%edi), %eax + movl %eax, 104(%edi) + movl %eax, 40(%esp) + movl 44(%esp), %eax + addl 44(%edi), %eax + movl %eax, 44(%edi) + xorl 108(%edi), %eax + movl %eax, 108(%edi) + movl %eax, 44(%esp) + movl 48(%esp), %eax + addl 48(%edi), %eax + movl %eax, 48(%edi) + xorl 112(%edi), %eax + movl %eax, 112(%edi) + movl %eax, 48(%esp) + movl 52(%esp), %eax + addl 52(%edi), %eax + movl %eax, 52(%edi) + xorl 116(%edi), %eax + movl %eax, 116(%edi) + movl %eax, 52(%esp) + movl 56(%esp), %eax + addl 56(%edi), %eax + movl %eax, 56(%edi) + xorl 120(%edi), %eax + movl %eax, 120(%edi) + movl %eax, 56(%esp) + movl 60(%esp), %eax + addl 60(%edi), %eax + movl %eax, 60(%edi) + xorl 124(%edi), %eax + movl %eax, 124(%edi) + movl %eax, 60(%esp) + + call salsa8_core_gen + + movl 92(%esp), %edi + movl 0(%esp), %eax + addl 64(%edi), %eax + movl %eax, 64(%edi) + movl 4(%esp), %eax + addl 68(%edi), %eax + movl %eax, 68(%edi) + movl 8(%esp), %eax + addl 72(%edi), %eax + movl %eax, 72(%edi) + movl 12(%esp), %eax + addl 76(%edi), %eax + movl %eax, 76(%edi) + movl 16(%esp), %eax + addl 80(%edi), %eax + movl %eax, 80(%edi) + movl 20(%esp), %eax + addl 84(%edi), %eax + movl %eax, 84(%edi) + movl 24(%esp), %eax + addl 88(%edi), %eax + movl %eax, 88(%edi) + movl 28(%esp), %eax + addl 92(%edi), %eax + movl %eax, 92(%edi) + movl 32(%esp), %eax + addl 96(%edi), %eax + movl %eax, 96(%edi) + movl 36(%esp), %eax + addl 100(%edi), %eax + movl %eax, 100(%edi) + movl 40(%esp), %eax + addl 104(%edi), %eax + movl %eax, 104(%edi) + movl 44(%esp), %eax + addl 108(%edi), %eax + movl %eax, 108(%edi) + movl 48(%esp), %eax + addl 112(%edi), %eax + movl %eax, 112(%edi) + movl 52(%esp), %eax + addl 116(%edi), %eax + movl %eax, 116(%edi) + movl 56(%esp), %eax + addl 120(%edi), %eax + movl %eax, 120(%edi) + movl 60(%esp), %eax + addl 124(%edi), %eax + movl %eax, 124(%edi) + + movl 64(%esp), %esi + movl 68(%esp), %ecx + addl $128, %esi + cmpl %ecx, %esi + jne scrypt_core_gen_loop1 + + movl 96(%esp), %esi + movl 100(%esp), %ecx + movl %ecx, %eax + subl $1, %eax + movl %eax, 100(%esp) +scrypt_core_gen_loop2: + movl %ecx, 68(%esp) + + movl 64(%edi), %edx + andl 100(%esp), %edx + shll $7, %edx + + movl 0(%edi), %eax + xorl 0(%esi, %edx), %eax + movl 64(%edi), %ebx + xorl 64(%esi, %edx), %ebx + movl %ebx, 64(%edi) + xorl %ebx, %eax + movl %eax, 0(%edi) + movl %eax, 0(%esp) + movl 4(%edi), %eax + xorl 4(%esi, %edx), %eax + movl 68(%edi), %ebx + xorl 68(%esi, %edx), %ebx + movl %ebx, 68(%edi) + xorl %ebx, %eax + movl %eax, 4(%edi) + movl %eax, 4(%esp) + movl 8(%edi), %eax + xorl 8(%esi, %edx), %eax + movl 72(%edi), %ebx + xorl 72(%esi, %edx), %ebx + movl %ebx, 72(%edi) + xorl %ebx, %eax + movl %eax, 8(%edi) + movl %eax, 8(%esp) + movl 12(%edi), %eax + xorl 12(%esi, %edx), %eax + movl 76(%edi), %ebx + xorl 76(%esi, %edx), %ebx + movl %ebx, 76(%edi) + xorl %ebx, %eax + movl %eax, 12(%edi) + movl %eax, 12(%esp) + movl 16(%edi), %eax + xorl 16(%esi, %edx), %eax + movl 80(%edi), %ebx + xorl 80(%esi, %edx), %ebx + movl %ebx, 80(%edi) + xorl %ebx, %eax + movl %eax, 16(%edi) + movl %eax, 16(%esp) + movl 20(%edi), %eax + 
xorl 20(%esi, %edx), %eax + movl 84(%edi), %ebx + xorl 84(%esi, %edx), %ebx + movl %ebx, 84(%edi) + xorl %ebx, %eax + movl %eax, 20(%edi) + movl %eax, 20(%esp) + movl 24(%edi), %eax + xorl 24(%esi, %edx), %eax + movl 88(%edi), %ebx + xorl 88(%esi, %edx), %ebx + movl %ebx, 88(%edi) + xorl %ebx, %eax + movl %eax, 24(%edi) + movl %eax, 24(%esp) + movl 28(%edi), %eax + xorl 28(%esi, %edx), %eax + movl 92(%edi), %ebx + xorl 92(%esi, %edx), %ebx + movl %ebx, 92(%edi) + xorl %ebx, %eax + movl %eax, 28(%edi) + movl %eax, 28(%esp) + movl 32(%edi), %eax + xorl 32(%esi, %edx), %eax + movl 96(%edi), %ebx + xorl 96(%esi, %edx), %ebx + movl %ebx, 96(%edi) + xorl %ebx, %eax + movl %eax, 32(%edi) + movl %eax, 32(%esp) + movl 36(%edi), %eax + xorl 36(%esi, %edx), %eax + movl 100(%edi), %ebx + xorl 100(%esi, %edx), %ebx + movl %ebx, 100(%edi) + xorl %ebx, %eax + movl %eax, 36(%edi) + movl %eax, 36(%esp) + movl 40(%edi), %eax + xorl 40(%esi, %edx), %eax + movl 104(%edi), %ebx + xorl 104(%esi, %edx), %ebx + movl %ebx, 104(%edi) + xorl %ebx, %eax + movl %eax, 40(%edi) + movl %eax, 40(%esp) + movl 44(%edi), %eax + xorl 44(%esi, %edx), %eax + movl 108(%edi), %ebx + xorl 108(%esi, %edx), %ebx + movl %ebx, 108(%edi) + xorl %ebx, %eax + movl %eax, 44(%edi) + movl %eax, 44(%esp) + movl 48(%edi), %eax + xorl 48(%esi, %edx), %eax + movl 112(%edi), %ebx + xorl 112(%esi, %edx), %ebx + movl %ebx, 112(%edi) + xorl %ebx, %eax + movl %eax, 48(%edi) + movl %eax, 48(%esp) + movl 52(%edi), %eax + xorl 52(%esi, %edx), %eax + movl 116(%edi), %ebx + xorl 116(%esi, %edx), %ebx + movl %ebx, 116(%edi) + xorl %ebx, %eax + movl %eax, 52(%edi) + movl %eax, 52(%esp) + movl 56(%edi), %eax + xorl 56(%esi, %edx), %eax + movl 120(%edi), %ebx + xorl 120(%esi, %edx), %ebx + movl %ebx, 120(%edi) + xorl %ebx, %eax + movl %eax, 56(%edi) + movl %eax, 56(%esp) + movl 60(%edi), %eax + xorl 60(%esi, %edx), %eax + movl 124(%edi), %ebx + xorl 124(%esi, %edx), %ebx + movl %ebx, 124(%edi) + xorl %ebx, %eax + movl %eax, 60(%edi) + movl %eax, 60(%esp) + + call salsa8_core_gen + + movl 92(%esp), %edi + movl 0(%esp), %eax + addl 0(%edi), %eax + movl %eax, 0(%edi) + xorl 64(%edi), %eax + movl %eax, 64(%edi) + movl %eax, 0(%esp) + movl 4(%esp), %eax + addl 4(%edi), %eax + movl %eax, 4(%edi) + xorl 68(%edi), %eax + movl %eax, 68(%edi) + movl %eax, 4(%esp) + movl 8(%esp), %eax + addl 8(%edi), %eax + movl %eax, 8(%edi) + xorl 72(%edi), %eax + movl %eax, 72(%edi) + movl %eax, 8(%esp) + movl 12(%esp), %eax + addl 12(%edi), %eax + movl %eax, 12(%edi) + xorl 76(%edi), %eax + movl %eax, 76(%edi) + movl %eax, 12(%esp) + movl 16(%esp), %eax + addl 16(%edi), %eax + movl %eax, 16(%edi) + xorl 80(%edi), %eax + movl %eax, 80(%edi) + movl %eax, 16(%esp) + movl 20(%esp), %eax + addl 20(%edi), %eax + movl %eax, 20(%edi) + xorl 84(%edi), %eax + movl %eax, 84(%edi) + movl %eax, 20(%esp) + movl 24(%esp), %eax + addl 24(%edi), %eax + movl %eax, 24(%edi) + xorl 88(%edi), %eax + movl %eax, 88(%edi) + movl %eax, 24(%esp) + movl 28(%esp), %eax + addl 28(%edi), %eax + movl %eax, 28(%edi) + xorl 92(%edi), %eax + movl %eax, 92(%edi) + movl %eax, 28(%esp) + movl 32(%esp), %eax + addl 32(%edi), %eax + movl %eax, 32(%edi) + xorl 96(%edi), %eax + movl %eax, 96(%edi) + movl %eax, 32(%esp) + movl 36(%esp), %eax + addl 36(%edi), %eax + movl %eax, 36(%edi) + xorl 100(%edi), %eax + movl %eax, 100(%edi) + movl %eax, 36(%esp) + movl 40(%esp), %eax + addl 40(%edi), %eax + movl %eax, 40(%edi) + xorl 104(%edi), %eax + movl %eax, 104(%edi) + movl %eax, 40(%esp) + movl 44(%esp), %eax + addl 44(%edi), 
%eax + movl %eax, 44(%edi) + xorl 108(%edi), %eax + movl %eax, 108(%edi) + movl %eax, 44(%esp) + movl 48(%esp), %eax + addl 48(%edi), %eax + movl %eax, 48(%edi) + xorl 112(%edi), %eax + movl %eax, 112(%edi) + movl %eax, 48(%esp) + movl 52(%esp), %eax + addl 52(%edi), %eax + movl %eax, 52(%edi) + xorl 116(%edi), %eax + movl %eax, 116(%edi) + movl %eax, 52(%esp) + movl 56(%esp), %eax + addl 56(%edi), %eax + movl %eax, 56(%edi) + xorl 120(%edi), %eax + movl %eax, 120(%edi) + movl %eax, 56(%esp) + movl 60(%esp), %eax + addl 60(%edi), %eax + movl %eax, 60(%edi) + xorl 124(%edi), %eax + movl %eax, 124(%edi) + movl %eax, 60(%esp) + + call salsa8_core_gen + + movl 92(%esp), %edi + movl 96(%esp), %esi + movl 0(%esp), %eax + addl 64(%edi), %eax + movl %eax, 64(%edi) + movl 4(%esp), %eax + addl 68(%edi), %eax + movl %eax, 68(%edi) + movl 8(%esp), %eax + addl 72(%edi), %eax + movl %eax, 72(%edi) + movl 12(%esp), %eax + addl 76(%edi), %eax + movl %eax, 76(%edi) + movl 16(%esp), %eax + addl 80(%edi), %eax + movl %eax, 80(%edi) + movl 20(%esp), %eax + addl 84(%edi), %eax + movl %eax, 84(%edi) + movl 24(%esp), %eax + addl 88(%edi), %eax + movl %eax, 88(%edi) + movl 28(%esp), %eax + addl 92(%edi), %eax + movl %eax, 92(%edi) + movl 32(%esp), %eax + addl 96(%edi), %eax + movl %eax, 96(%edi) + movl 36(%esp), %eax + addl 100(%edi), %eax + movl %eax, 100(%edi) + movl 40(%esp), %eax + addl 104(%edi), %eax + movl %eax, 104(%edi) + movl 44(%esp), %eax + addl 108(%edi), %eax + movl %eax, 108(%edi) + movl 48(%esp), %eax + addl 112(%edi), %eax + movl %eax, 112(%edi) + movl 52(%esp), %eax + addl 116(%edi), %eax + movl %eax, 116(%edi) + movl 56(%esp), %eax + addl 120(%edi), %eax + movl %eax, 120(%edi) + movl 60(%esp), %eax + addl 124(%edi), %eax + movl %eax, 124(%edi) + + movl 68(%esp), %ecx + subl $1, %ecx ja scrypt_core_gen_loop2 - addl $72, %esp - popl %esi - popl %edi - popl %ebp - popl %ebx - ret - - -.macro salsa8_core_sse2_doubleround + addl $72, %esp + popl %esi + popl %edi + popl %ebp + popl %ebx + ret + + + + + .p2align 5 +scrypt_core_sse2: + movl 20(%esp), %edi + movl 24(%esp), %esi + movl %esp, %ebp + subl $128, %esp + andl $-16, %esp + + movl 0+60(%edi), %eax + movl 0+44(%edi), %ebx + movl 0+28(%edi), %ecx + movl 0+12(%edi), %edx + movl %eax, 0+12(%esp) + movl %ebx, 0+28(%esp) + movl %ecx, 0+44(%esp) + movl %edx, 0+60(%esp) + movl 0+40(%edi), %eax + movl 0+8(%edi), %ebx + movl 0+48(%edi), %ecx + movl 0+16(%edi), %edx + movl %eax, 0+8(%esp) + movl %ebx, 0+40(%esp) + movl %ecx, 0+16(%esp) + movl %edx, 0+48(%esp) + movl 0+20(%edi), %eax + movl 0+4(%edi), %ebx + movl 0+52(%edi), %ecx + movl 0+36(%edi), %edx + movl %eax, 0+4(%esp) + movl %ebx, 0+20(%esp) + movl %ecx, 0+36(%esp) + movl %edx, 0+52(%esp) + movl 0+0(%edi), %eax + movl 0+24(%edi), %ebx + movl 0+32(%edi), %ecx + movl 0+56(%edi), %edx + movl %eax, 0+0(%esp) + movl %ebx, 0+24(%esp) + movl %ecx, 0+32(%esp) + movl %edx, 0+56(%esp) + movl 64+60(%edi), %eax + movl 64+44(%edi), %ebx + movl 64+28(%edi), %ecx + movl 64+12(%edi), %edx + movl %eax, 64+12(%esp) + movl %ebx, 64+28(%esp) + movl %ecx, 64+44(%esp) + movl %edx, 64+60(%esp) + movl 64+40(%edi), %eax + movl 64+8(%edi), %ebx + movl 64+48(%edi), %ecx + movl 64+16(%edi), %edx + movl %eax, 64+8(%esp) + movl %ebx, 64+40(%esp) + movl %ecx, 64+16(%esp) + movl %edx, 64+48(%esp) + movl 64+20(%edi), %eax + movl 64+4(%edi), %ebx + movl 64+52(%edi), %ecx + movl 64+36(%edi), %edx + movl %eax, 64+4(%esp) + movl %ebx, 64+20(%esp) + movl %ecx, 64+36(%esp) + movl %edx, 64+52(%esp) + movl 64+0(%edi), %eax + movl 
64+24(%edi), %ebx + movl 64+32(%edi), %ecx + movl 64+56(%edi), %edx + movl %eax, 64+0(%esp) + movl %ebx, 64+24(%esp) + movl %ecx, 64+32(%esp) + movl %edx, 64+56(%esp) + + movdqa 96(%esp), %xmm6 + movdqa 112(%esp), %xmm7 + + movl %esi, %edx + movl 28(%ebp), %ecx + shll $7, %ecx + addl %esi, %ecx +scrypt_core_sse2_loop1: + movdqa 0(%esp), %xmm0 + movdqa 16(%esp), %xmm1 + movdqa 32(%esp), %xmm2 + movdqa 48(%esp), %xmm3 + movdqa 64(%esp), %xmm4 + movdqa 80(%esp), %xmm5 + pxor %xmm4, %xmm0 + pxor %xmm5, %xmm1 + movdqa %xmm0, 0(%edx) + movdqa %xmm1, 16(%edx) + pxor %xmm6, %xmm2 + pxor %xmm7, %xmm3 + movdqa %xmm2, 32(%edx) + movdqa %xmm3, 48(%edx) + movdqa %xmm4, 64(%edx) + movdqa %xmm5, 80(%edx) + movdqa %xmm6, 96(%edx) + movdqa %xmm7, 112(%edx) + + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + 
pxor %xmm5, %xmm3 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + paddd 0(%edx), %xmm0 + paddd 16(%edx), %xmm1 + paddd 32(%edx), %xmm2 + paddd 48(%edx), %xmm3 + movdqa %xmm0, 0(%esp) + movdqa %xmm1, 16(%esp) + movdqa %xmm2, 32(%esp) + movdqa %xmm3, 48(%esp) + + pxor 64(%esp), %xmm0 + pxor 80(%esp), %xmm1 + pxor %xmm6, %xmm2 + pxor %xmm7, %xmm3 + movdqa %xmm0, 64(%esp) + movdqa %xmm1, 80(%esp) + movdqa %xmm2, %xmm6 + movdqa %xmm3, %xmm7 + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd 
$0x39, %xmm1, %xmm1 + + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld 
$23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + paddd 64(%esp), %xmm0 + paddd 80(%esp), %xmm1 + paddd %xmm2, %xmm6 + paddd %xmm3, %xmm7 + movdqa %xmm0, 64(%esp) + movdqa %xmm1, 80(%esp) + + addl $128, %edx + cmpl %ecx, %edx + jne scrypt_core_sse2_loop1 + + movdqa 64(%esp), %xmm4 + movdqa 80(%esp), %xmm5 + + movl 28(%ebp), %ecx + movl %ecx, %eax + subl $1, %eax +scrypt_core_sse2_loop2: + movd %xmm4, %edx + movdqa 0(%esp), %xmm0 + movdqa 16(%esp), %xmm1 + movdqa 32(%esp), %xmm2 + movdqa 48(%esp), %xmm3 + andl %eax, %edx + shll $7, %edx + pxor 0(%esi, %edx), %xmm0 + pxor 16(%esi, %edx), %xmm1 + pxor 32(%esi, %edx), %xmm2 + pxor 48(%esi, %edx), %xmm3 + + pxor %xmm4, %xmm0 + pxor %xmm5, %xmm1 + movdqa %xmm0, 0(%esp) + movdqa %xmm1, 16(%esp) + pxor %xmm6, %xmm2 + pxor %xmm7, %xmm3 + movdqa %xmm2, 32(%esp) + movdqa %xmm3, 48(%esp) + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 + + 
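The SSE2 path unrolled here (selected at run time by the cpuid test of EDX bit 26 near the top of scrypt_core) keeps the Salsa20 state diagonally arranged in xmm0..xmm3. SSE2 has no vector rotate, so each rotate is a pslld/psrld pair combined with pxor, and the pshufd 0x93 / 0x4e / 0x39 shuffles realign the diagonals between the two halves of each double round. A rough intrinsics rendering of one double round follows; ROTL_EPI32 and salsa_doubleround are illustrative names, not identifiers from this patch.

#include <emmintrin.h>

/* Rotate each 32-bit lane left by k: the pslld/psrld/pxor idiom used above. */
#define ROTL_EPI32(v, k) \
	_mm_xor_si128(_mm_slli_epi32((v), (k)), _mm_srli_epi32((v), 32 - (k)))

/* One salsa8 double round on the diagonally stored state x0..x3,
 * mirroring the unrolled add/rotate/XOR/pshufd sequence in the assembly. */
static inline void salsa_doubleround(__m128i *x0, __m128i *x1,
                                     __m128i *x2, __m128i *x3)
{
	/* "column" half */
	*x3 = _mm_xor_si128(*x3, ROTL_EPI32(_mm_add_epi32(*x1, *x0), 7));
	*x2 = _mm_xor_si128(*x2, ROTL_EPI32(_mm_add_epi32(*x0, *x3), 9));
	*x1 = _mm_xor_si128(*x1, ROTL_EPI32(_mm_add_epi32(*x3, *x2), 13));
	*x0 = _mm_xor_si128(*x0, ROTL_EPI32(_mm_add_epi32(*x2, *x1), 18));
	*x3 = _mm_shuffle_epi32(*x3, 0x93);	/* pshufd $0x93 */
	*x2 = _mm_shuffle_epi32(*x2, 0x4e);	/* pshufd $0x4e */
	*x1 = _mm_shuffle_epi32(*x1, 0x39);	/* pshufd $0x39 */

	/* "row" half: same pattern with x1 and x3 exchanging roles */
	*x1 = _mm_xor_si128(*x1, ROTL_EPI32(_mm_add_epi32(*x3, *x0), 7));
	*x2 = _mm_xor_si128(*x2, ROTL_EPI32(_mm_add_epi32(*x0, *x1), 9));
	*x3 = _mm_xor_si128(*x3, ROTL_EPI32(_mm_add_epi32(*x1, *x2), 13));
	*x0 = _mm_xor_si128(*x0, ROTL_EPI32(_mm_add_epi32(*x2, *x3), 18));
	*x1 = _mm_shuffle_epi32(*x1, 0x93);
	*x2 = _mm_shuffle_epi32(*x2, 0x4e);
	*x3 = _mm_shuffle_epi32(*x3, 0x39);
}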
paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 
+ psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + paddd 0(%esp), %xmm0 + paddd 16(%esp), %xmm1 + paddd 32(%esp), %xmm2 + paddd 48(%esp), %xmm3 + movdqa %xmm0, 0(%esp) + movdqa %xmm1, 16(%esp) + movdqa %xmm2, 32(%esp) + movdqa %xmm3, 48(%esp) + + pxor 64(%esi, %edx), %xmm0 + pxor 80(%esi, %edx), %xmm1 + pxor 96(%esi, %edx), %xmm2 + pxor 112(%esi, %edx), %xmm3 + pxor 64(%esp), %xmm0 + pxor 80(%esp), %xmm1 + pxor %xmm6, %xmm2 + pxor %xmm7, %xmm3 + movdqa %xmm0, 64(%esp) + movdqa %xmm1, 80(%esp) + movdqa %xmm2, %xmm6 + movdqa %xmm3, %xmm7 + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 movdqa %xmm1, %xmm4 paddd %xmm0, %xmm4 movdqa %xmm4, %xmm5 @@ -675,133 +2428,144 @@ scrypt_core_gen_loop2: pxor %xmm4, %xmm0 pshufd $0x39, %xmm3, %xmm3 pxor %xmm5, %xmm0 -.endm - -.macro salsa8_core_sse2 - salsa8_core_sse2_doubleround - salsa8_core_sse2_doubleround - salsa8_core_sse2_doubleround - salsa8_core_sse2_doubleround -.endm - - .p2align 5 -scrypt_core_sse2: - movl 20(%esp), %edi - movl 24(%esp), %esi - movl %esp, %ebp - subl $128, %esp - andl $-16, %esp + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 - scrypt_shuffle %edi, 0, %esp, 0 - scrypt_shuffle %edi, 64, %esp, 64 + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 - movdqa 96(%esp), %xmm6 - movdqa 112(%esp), %xmm7 + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 - movl %esi, %edx - movl 28(%ebp), %ecx - shll $7, %ecx - addl %esi, %ecx -scrypt_core_sse2_loop1: - movdqa 0(%esp), %xmm0 - movdqa 16(%esp), %xmm1 - movdqa 32(%esp), %xmm2 - movdqa 48(%esp), %xmm3 - movdqa 64(%esp), %xmm4 - movdqa 80(%esp), %xmm5 + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 pxor 
%xmm5, %xmm1 - movdqa %xmm0, 0(%edx) - movdqa %xmm1, 16(%edx) - pxor %xmm6, %xmm2 - pxor %xmm7, %xmm3 - movdqa %xmm2, 32(%edx) - movdqa %xmm3, 48(%edx) - movdqa %xmm4, 64(%edx) - movdqa %xmm5, 80(%edx) - movdqa %xmm6, 96(%edx) - movdqa %xmm7, 112(%edx) - salsa8_core_sse2 - paddd 0(%edx), %xmm0 - paddd 16(%edx), %xmm1 - paddd 32(%edx), %xmm2 - paddd 48(%edx), %xmm3 - movdqa %xmm0, 0(%esp) - movdqa %xmm1, 16(%esp) - movdqa %xmm2, 32(%esp) - movdqa %xmm3, 48(%esp) + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 - pxor 64(%esp), %xmm0 - pxor 80(%esp), %xmm1 - pxor %xmm6, %xmm2 - pxor %xmm7, %xmm3 - movdqa %xmm0, 64(%esp) - movdqa %xmm1, 80(%esp) - movdqa %xmm2, %xmm6 - movdqa %xmm3, %xmm7 - salsa8_core_sse2 - paddd 64(%esp), %xmm0 - paddd 80(%esp), %xmm1 - paddd %xmm2, %xmm6 - paddd %xmm3, %xmm7 - movdqa %xmm0, 64(%esp) - movdqa %xmm1, 80(%esp) + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 - addl $128, %edx - cmpl %ecx, %edx - jne scrypt_core_sse2_loop1 + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 - movdqa 64(%esp), %xmm4 - movdqa 80(%esp), %xmm5 + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 - movl 28(%ebp), %ecx - movl %ecx, %eax - subl $1, %eax -scrypt_core_sse2_loop2: - movd %xmm4, %edx - movdqa 0(%esp), %xmm0 - movdqa 16(%esp), %xmm1 - movdqa 32(%esp), %xmm2 - movdqa 48(%esp), %xmm3 - andl %eax, %edx - shll $7, %edx - pxor 0(%esi, %edx), %xmm0 - pxor 16(%esi, %edx), %xmm1 - pxor 32(%esi, %edx), %xmm2 - pxor 48(%esi, %edx), %xmm3 + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 pxor %xmm5, %xmm1 - movdqa %xmm0, 0(%esp) - movdqa %xmm1, 16(%esp) - pxor %xmm6, %xmm2 - pxor %xmm7, %xmm3 - movdqa %xmm2, 32(%esp) - movdqa %xmm3, 48(%esp) - salsa8_core_sse2 - paddd 0(%esp), %xmm0 - paddd 16(%esp), %xmm1 - paddd 32(%esp), %xmm2 - paddd 48(%esp), %xmm3 - movdqa %xmm0, 0(%esp) - movdqa %xmm1, 16(%esp) - movdqa %xmm2, 32(%esp) - movdqa %xmm3, 48(%esp) - pxor 64(%esi, %edx), %xmm0 - pxor 80(%esi, %edx), %xmm1 - pxor 96(%esi, %edx), %xmm2 - pxor 112(%esi, %edx), %xmm3 - pxor 64(%esp), %xmm0 - pxor 80(%esp), %xmm1 - pxor %xmm6, %xmm2 - pxor %xmm7, %xmm3 - movdqa %xmm0, 64(%esp) - movdqa %xmm1, 80(%esp) - movdqa %xmm2, %xmm6 - movdqa %xmm3, %xmm7 - salsa8_core_sse2 + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor 
%xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 paddd 64(%esp), %xmm0 paddd 80(%esp), %xmm1 paddd %xmm2, %xmm6 @@ -817,8 +2581,70 @@ scrypt_core_sse2_loop2: movdqa %xmm6, 96(%esp) movdqa %xmm7, 112(%esp) - scrypt_shuffle %esp, 0, %edi, 0 - scrypt_shuffle %esp, 64, %edi, 64 + movl 0+60(%esp), %eax + movl 0+44(%esp), %ebx + movl 0+28(%esp), %ecx + movl 0+12(%esp), %edx + movl %eax, 0+12(%edi) + movl %ebx, 0+28(%edi) + movl %ecx, 0+44(%edi) + movl %edx, 0+60(%edi) + movl 0+40(%esp), %eax + movl 0+8(%esp), %ebx + movl 0+48(%esp), %ecx + movl 0+16(%esp), %edx + movl %eax, 0+8(%edi) + movl %ebx, 0+40(%edi) + movl %ecx, 0+16(%edi) + movl %edx, 0+48(%edi) + movl 0+20(%esp), %eax + movl 0+4(%esp), %ebx + movl 0+52(%esp), %ecx + movl 0+36(%esp), %edx + movl %eax, 0+4(%edi) + movl %ebx, 0+20(%edi) + movl %ecx, 0+36(%edi) + movl %edx, 0+52(%edi) + movl 0+0(%esp), %eax + movl 0+24(%esp), %ebx + movl 0+32(%esp), %ecx + movl 0+56(%esp), %edx + movl %eax, 0+0(%edi) + movl %ebx, 0+24(%edi) + movl %ecx, 0+32(%edi) + movl %edx, 0+56(%edi) + movl 64+60(%esp), %eax + movl 64+44(%esp), %ebx + movl 64+28(%esp), %ecx + movl 64+12(%esp), %edx + movl %eax, 64+12(%edi) + movl %ebx, 64+28(%edi) + movl %ecx, 64+44(%edi) + movl %edx, 64+60(%edi) + movl 64+40(%esp), %eax + movl 64+8(%esp), %ebx + movl 64+48(%esp), %ecx + movl 64+16(%esp), %edx + movl %eax, 64+8(%edi) + movl %ebx, 64+40(%edi) + movl %ecx, 64+16(%edi) + movl %edx, 64+48(%edi) + movl 64+20(%esp), %eax + movl 64+4(%esp), %ebx + movl 64+52(%esp), %ecx + movl 64+36(%esp), %edx + movl %eax, 64+4(%edi) + movl %ebx, 64+20(%edi) + movl %ecx, 64+36(%edi) + movl %edx, 64+52(%edi) + movl 64+0(%esp), %eax + movl 64+24(%esp), %ebx + movl 64+32(%esp), %ecx + movl 64+56(%esp), %edx + movl %eax, 64+0(%edi) + movl %ebx, 64+24(%edi) + movl %ecx, 64+32(%edi) + movl %edx, 64+56(%edi) movl %ebp, %esp popl %esi diff --git a/scrypt-x86.S.orig b/scrypt-x86.S.orig new file mode 100644 index 000000000..5ab7eda65 --- /dev/null +++ b/scrypt-x86.S.orig @@ -0,0 +1,830 @@ +/* + * Copyright 2011-2012, 2014 pooler@litecoinpool.org + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "cpuminer-config.h" + +#if defined(__linux__) && defined(__ELF__) + .section .note.GNU-stack,"",%progbits +#endif + +#if defined(USE_ASM) && defined(__i386__) + +.macro scrypt_shuffle src, so, dest, do + movl \so+60(\src), %eax + movl \so+44(\src), %ebx + movl \so+28(\src), %ecx + movl \so+12(\src), %edx + movl %eax, \do+12(\dest) + movl %ebx, \do+28(\dest) + movl %ecx, \do+44(\dest) + movl %edx, \do+60(\dest) + movl \so+40(\src), %eax + movl \so+8(\src), %ebx + movl \so+48(\src), %ecx + movl \so+16(\src), %edx + movl %eax, \do+8(\dest) + movl %ebx, \do+40(\dest) + movl %ecx, \do+16(\dest) + movl %edx, \do+48(\dest) + movl \so+20(\src), %eax + movl \so+4(\src), %ebx + movl \so+52(\src), %ecx + movl \so+36(\src), %edx + movl %eax, \do+4(\dest) + movl %ebx, \do+20(\dest) + movl %ecx, \do+36(\dest) + movl %edx, \do+52(\dest) + movl \so+0(\src), %eax + movl \so+24(\src), %ebx + movl \so+32(\src), %ecx + movl \so+56(\src), %edx + movl %eax, \do+0(\dest) + movl %ebx, \do+24(\dest) + movl %ecx, \do+32(\dest) + movl %edx, \do+56(\dest) +.endm + +.macro salsa8_core_gen_quadround + movl 52(%esp), %ecx + movl 4(%esp), %edx + movl 20(%esp), %ebx + movl 8(%esp), %esi + leal (%ecx, %edx), %edi + roll $7, %edi + xorl %edi, %ebx + movl %ebx, 4(%esp) + movl 36(%esp), %edi + leal (%edx, %ebx), %ebp + roll $9, %ebp + xorl %ebp, %edi + movl 24(%esp), %ebp + movl %edi, 8(%esp) + addl %edi, %ebx + roll $13, %ebx + xorl %ebx, %ecx + movl 40(%esp), %ebx + movl %ecx, 20(%esp) + addl %edi, %ecx + roll $18, %ecx + leal (%esi, %ebp), %edi + roll $7, %edi + xorl %edi, %ebx + movl %ebx, 24(%esp) + movl 56(%esp), %edi + xorl %ecx, %edx + leal (%ebp, %ebx), %ecx + roll $9, %ecx + xorl %ecx, %edi + movl %edi, 36(%esp) + movl 28(%esp), %ecx + movl %edx, 28(%esp) + movl 44(%esp), %edx + addl %edi, %ebx + roll $13, %ebx + xorl %ebx, %esi + movl 60(%esp), %ebx + movl %esi, 40(%esp) + addl %edi, %esi + roll $18, %esi + leal (%ecx, %edx), %edi + roll $7, %edi + xorl %edi, %ebx + movl %ebx, 44(%esp) + movl 12(%esp), %edi + xorl %esi, %ebp + leal (%edx, %ebx), %esi + roll $9, %esi + xorl %esi, %edi + movl %edi, 12(%esp) + movl 48(%esp), %esi + movl %ebp, 48(%esp) + movl 64(%esp), %ebp + addl %edi, %ebx + roll $13, %ebx + xorl %ebx, %ecx + movl 16(%esp), %ebx + movl %ecx, 16(%esp) + addl %edi, %ecx + roll $18, %ecx + leal (%esi, %ebp), %edi + roll $7, %edi + xorl %edi, %ebx + movl 32(%esp), %edi + xorl %ecx, %edx + leal (%ebp, %ebx), %ecx + roll $9, %ecx + xorl %ecx, %edi + movl %edi, 32(%esp) + movl %ebx, %ecx + movl %edx, 52(%esp) + movl 28(%esp), %edx + addl %edi, %ebx + roll $13, %ebx + xorl %ebx, %esi + movl 40(%esp), %ebx + movl %esi, 28(%esp) + addl %edi, %esi + roll $18, %esi + leal (%ecx, %edx), %edi + roll $7, %edi + xorl %edi, %ebx + movl %ebx, 40(%esp) + movl 12(%esp), %edi + xorl %esi, %ebp + leal (%edx, %ebx), %esi + roll $9, %esi + xorl %esi, %edi + movl %edi, 12(%esp) + movl 4(%esp), %esi + movl %ebp, 4(%esp) + movl 48(%esp), %ebp + addl %edi, %ebx + roll $13, %ebx + xorl %ebx, %ecx + movl 16(%esp), %ebx + movl %ecx, 16(%esp) + addl %edi, %ecx + roll $18, %ecx + leal (%esi, %ebp), %edi + roll $7, %edi + xorl %edi, %ebx + movl %ebx, 48(%esp) + movl 32(%esp), %edi + xorl %ecx, %edx + leal (%ebp, %ebx), %ecx + roll $9, %ecx + xorl %ecx, %edi + movl %edi, 32(%esp) + movl 24(%esp), %ecx + movl %edx, 24(%esp) + movl 52(%esp), %edx + addl %edi, %ebx + roll $13, %ebx + xorl %ebx, %esi + movl 28(%esp), %ebx + movl %esi, 28(%esp) + addl %edi, %esi + roll $18, %esi + leal (%ecx, %edx), %edi + roll $7, %edi + 
xorl %edi, %ebx + movl %ebx, 52(%esp) + movl 8(%esp), %edi + xorl %esi, %ebp + leal (%edx, %ebx), %esi + roll $9, %esi + xorl %esi, %edi + movl %edi, 8(%esp) + movl 44(%esp), %esi + movl %ebp, 44(%esp) + movl 4(%esp), %ebp + addl %edi, %ebx + roll $13, %ebx + xorl %ebx, %ecx + movl 20(%esp), %ebx + movl %ecx, 4(%esp) + addl %edi, %ecx + roll $18, %ecx + leal (%esi, %ebp), %edi + roll $7, %edi + xorl %edi, %ebx + movl 36(%esp), %edi + xorl %ecx, %edx + leal (%ebp, %ebx), %ecx + roll $9, %ecx + xorl %ecx, %edi + movl %edi, 20(%esp) + movl %ebx, %ecx + movl %edx, 36(%esp) + movl 24(%esp), %edx + addl %edi, %ebx + roll $13, %ebx + xorl %ebx, %esi + movl 28(%esp), %ebx + movl %esi, 24(%esp) + addl %edi, %esi + roll $18, %esi + leal (%ecx, %edx), %edi + roll $7, %edi + xorl %edi, %ebx + movl %ebx, 28(%esp) + xorl %esi, %ebp + movl 8(%esp), %esi + leal (%edx, %ebx), %edi + roll $9, %edi + xorl %edi, %esi + movl 40(%esp), %edi + movl %ebp, 8(%esp) + movl 44(%esp), %ebp + movl %esi, 40(%esp) + addl %esi, %ebx + roll $13, %ebx + xorl %ebx, %ecx + movl 4(%esp), %ebx + movl %ecx, 44(%esp) + addl %esi, %ecx + roll $18, %ecx + leal (%edi, %ebp), %esi + roll $7, %esi + xorl %esi, %ebx + movl %ebx, 4(%esp) + movl 20(%esp), %esi + xorl %ecx, %edx + leal (%ebp, %ebx), %ecx + roll $9, %ecx + xorl %ecx, %esi + movl %esi, 56(%esp) + movl 48(%esp), %ecx + movl %edx, 20(%esp) + movl 36(%esp), %edx + addl %esi, %ebx + roll $13, %ebx + xorl %ebx, %edi + movl 24(%esp), %ebx + movl %edi, 24(%esp) + addl %esi, %edi + roll $18, %edi + leal (%ecx, %edx), %esi + roll $7, %esi + xorl %esi, %ebx + movl %ebx, 60(%esp) + movl 12(%esp), %esi + xorl %edi, %ebp + leal (%edx, %ebx), %edi + roll $9, %edi + xorl %edi, %esi + movl %esi, 12(%esp) + movl 52(%esp), %edi + movl %ebp, 36(%esp) + movl 8(%esp), %ebp + addl %esi, %ebx + roll $13, %ebx + xorl %ebx, %ecx + movl 16(%esp), %ebx + movl %ecx, 16(%esp) + addl %esi, %ecx + roll $18, %ecx + leal (%edi, %ebp), %esi + roll $7, %esi + xorl %esi, %ebx + movl 32(%esp), %esi + xorl %ecx, %edx + leal (%ebp, %ebx), %ecx + roll $9, %ecx + xorl %ecx, %esi + movl %esi, 32(%esp) + movl %ebx, %ecx + movl %edx, 48(%esp) + movl 20(%esp), %edx + addl %esi, %ebx + roll $13, %ebx + xorl %ebx, %edi + movl 24(%esp), %ebx + movl %edi, 20(%esp) + addl %esi, %edi + roll $18, %edi + leal (%ecx, %edx), %esi + roll $7, %esi + xorl %esi, %ebx + movl %ebx, 8(%esp) + movl 12(%esp), %esi + xorl %edi, %ebp + leal (%edx, %ebx), %edi + roll $9, %edi + xorl %edi, %esi + movl %esi, 12(%esp) + movl 28(%esp), %edi + movl %ebp, 52(%esp) + movl 36(%esp), %ebp + addl %esi, %ebx + roll $13, %ebx + xorl %ebx, %ecx + movl 16(%esp), %ebx + movl %ecx, 16(%esp) + addl %esi, %ecx + roll $18, %ecx + leal (%edi, %ebp), %esi + roll $7, %esi + xorl %esi, %ebx + movl %ebx, 28(%esp) + movl 32(%esp), %esi + xorl %ecx, %edx + leal (%ebp, %ebx), %ecx + roll $9, %ecx + xorl %ecx, %esi + movl %esi, 32(%esp) + movl 4(%esp), %ecx + movl %edx, 4(%esp) + movl 48(%esp), %edx + addl %esi, %ebx + roll $13, %ebx + xorl %ebx, %edi + movl 20(%esp), %ebx + movl %edi, 20(%esp) + addl %esi, %edi + roll $18, %edi + leal (%ecx, %edx), %esi + roll $7, %esi + xorl %esi, %ebx + movl %ebx, 48(%esp) + movl 40(%esp), %esi + xorl %edi, %ebp + leal (%edx, %ebx), %edi + roll $9, %edi + xorl %edi, %esi + movl %esi, 36(%esp) + movl 60(%esp), %edi + movl %ebp, 24(%esp) + movl 52(%esp), %ebp + addl %esi, %ebx + roll $13, %ebx + xorl %ebx, %ecx + movl 44(%esp), %ebx + movl %ecx, 40(%esp) + addl %esi, %ecx + roll $18, %ecx + leal (%edi, %ebp), %esi + roll $7, %esi + 
xorl %esi, %ebx + movl %ebx, 52(%esp) + movl 56(%esp), %esi + xorl %ecx, %edx + leal (%ebp, %ebx), %ecx + roll $9, %ecx + xorl %ecx, %esi + movl %esi, 56(%esp) + addl %esi, %ebx + movl %edx, 44(%esp) + roll $13, %ebx + xorl %ebx, %edi + movl %edi, 60(%esp) + addl %esi, %edi + roll $18, %edi + xorl %edi, %ebp + movl %ebp, 64(%esp) +.endm + + .text + .p2align 5 +salsa8_core_gen: + salsa8_core_gen_quadround + salsa8_core_gen_quadround + ret + + + .text + .p2align 5 + .globl scrypt_core + .globl _scrypt_core +scrypt_core: +_scrypt_core: + pushl %ebx + pushl %ebp + pushl %edi + pushl %esi + + /* Check for SSE2 availability */ + movl $1, %eax + cpuid + andl $0x04000000, %edx + jnz scrypt_core_sse2 + +scrypt_core_gen: + movl 20(%esp), %edi + movl 24(%esp), %esi + movl 28(%esp), %ecx + subl $72, %esp + +.macro scrypt_core_macro1a p, q + movl \p(%edi), %eax + movl \q(%edi), %edx + movl %eax, \p(%esi) + movl %edx, \q(%esi) + xorl %edx, %eax + movl %eax, \p(%edi) + movl %eax, \p(%esp) +.endm + +.macro scrypt_core_macro1b p, q + movl \p(%edi), %eax + xorl \p(%esi, %edx), %eax + movl \q(%edi), %ebx + xorl \q(%esi, %edx), %ebx + movl %ebx, \q(%edi) + xorl %ebx, %eax + movl %eax, \p(%edi) + movl %eax, \p(%esp) +.endm + +.macro scrypt_core_macro2 p, q + movl \p(%esp), %eax + addl \p(%edi), %eax + movl %eax, \p(%edi) + xorl \q(%edi), %eax + movl %eax, \q(%edi) + movl %eax, \p(%esp) +.endm + +.macro scrypt_core_macro3 p, q + movl \p(%esp), %eax + addl \q(%edi), %eax + movl %eax, \q(%edi) +.endm + + shll $7, %ecx + addl %esi, %ecx +scrypt_core_gen_loop1: + movl %esi, 64(%esp) + movl %ecx, 68(%esp) + + scrypt_core_macro1a 0, 64 + scrypt_core_macro1a 4, 68 + scrypt_core_macro1a 8, 72 + scrypt_core_macro1a 12, 76 + scrypt_core_macro1a 16, 80 + scrypt_core_macro1a 20, 84 + scrypt_core_macro1a 24, 88 + scrypt_core_macro1a 28, 92 + scrypt_core_macro1a 32, 96 + scrypt_core_macro1a 36, 100 + scrypt_core_macro1a 40, 104 + scrypt_core_macro1a 44, 108 + scrypt_core_macro1a 48, 112 + scrypt_core_macro1a 52, 116 + scrypt_core_macro1a 56, 120 + scrypt_core_macro1a 60, 124 + + call salsa8_core_gen + + movl 92(%esp), %edi + scrypt_core_macro2 0, 64 + scrypt_core_macro2 4, 68 + scrypt_core_macro2 8, 72 + scrypt_core_macro2 12, 76 + scrypt_core_macro2 16, 80 + scrypt_core_macro2 20, 84 + scrypt_core_macro2 24, 88 + scrypt_core_macro2 28, 92 + scrypt_core_macro2 32, 96 + scrypt_core_macro2 36, 100 + scrypt_core_macro2 40, 104 + scrypt_core_macro2 44, 108 + scrypt_core_macro2 48, 112 + scrypt_core_macro2 52, 116 + scrypt_core_macro2 56, 120 + scrypt_core_macro2 60, 124 + + call salsa8_core_gen + + movl 92(%esp), %edi + scrypt_core_macro3 0, 64 + scrypt_core_macro3 4, 68 + scrypt_core_macro3 8, 72 + scrypt_core_macro3 12, 76 + scrypt_core_macro3 16, 80 + scrypt_core_macro3 20, 84 + scrypt_core_macro3 24, 88 + scrypt_core_macro3 28, 92 + scrypt_core_macro3 32, 96 + scrypt_core_macro3 36, 100 + scrypt_core_macro3 40, 104 + scrypt_core_macro3 44, 108 + scrypt_core_macro3 48, 112 + scrypt_core_macro3 52, 116 + scrypt_core_macro3 56, 120 + scrypt_core_macro3 60, 124 + + movl 64(%esp), %esi + movl 68(%esp), %ecx + addl $128, %esi + cmpl %ecx, %esi + jne scrypt_core_gen_loop1 + + movl 96(%esp), %esi + movl 100(%esp), %ecx + movl %ecx, %eax + subl $1, %eax + movl %eax, 100(%esp) +scrypt_core_gen_loop2: + movl %ecx, 68(%esp) + + movl 64(%edi), %edx + andl 100(%esp), %edx + shll $7, %edx + + scrypt_core_macro1b 0, 64 + scrypt_core_macro1b 4, 68 + scrypt_core_macro1b 8, 72 + scrypt_core_macro1b 12, 76 + scrypt_core_macro1b 16, 80 + 
scrypt_core_macro1b 20, 84 + scrypt_core_macro1b 24, 88 + scrypt_core_macro1b 28, 92 + scrypt_core_macro1b 32, 96 + scrypt_core_macro1b 36, 100 + scrypt_core_macro1b 40, 104 + scrypt_core_macro1b 44, 108 + scrypt_core_macro1b 48, 112 + scrypt_core_macro1b 52, 116 + scrypt_core_macro1b 56, 120 + scrypt_core_macro1b 60, 124 + + call salsa8_core_gen + + movl 92(%esp), %edi + scrypt_core_macro2 0, 64 + scrypt_core_macro2 4, 68 + scrypt_core_macro2 8, 72 + scrypt_core_macro2 12, 76 + scrypt_core_macro2 16, 80 + scrypt_core_macro2 20, 84 + scrypt_core_macro2 24, 88 + scrypt_core_macro2 28, 92 + scrypt_core_macro2 32, 96 + scrypt_core_macro2 36, 100 + scrypt_core_macro2 40, 104 + scrypt_core_macro2 44, 108 + scrypt_core_macro2 48, 112 + scrypt_core_macro2 52, 116 + scrypt_core_macro2 56, 120 + scrypt_core_macro2 60, 124 + + call salsa8_core_gen + + movl 92(%esp), %edi + movl 96(%esp), %esi + scrypt_core_macro3 0, 64 + scrypt_core_macro3 4, 68 + scrypt_core_macro3 8, 72 + scrypt_core_macro3 12, 76 + scrypt_core_macro3 16, 80 + scrypt_core_macro3 20, 84 + scrypt_core_macro3 24, 88 + scrypt_core_macro3 28, 92 + scrypt_core_macro3 32, 96 + scrypt_core_macro3 36, 100 + scrypt_core_macro3 40, 104 + scrypt_core_macro3 44, 108 + scrypt_core_macro3 48, 112 + scrypt_core_macro3 52, 116 + scrypt_core_macro3 56, 120 + scrypt_core_macro3 60, 124 + + movl 68(%esp), %ecx + subl $1, %ecx + ja scrypt_core_gen_loop2 + + addl $72, %esp + popl %esi + popl %edi + popl %ebp + popl %ebx + ret + + +.macro salsa8_core_sse2_doubleround + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 +.endm + +.macro salsa8_core_sse2 + salsa8_core_sse2_doubleround + salsa8_core_sse2_doubleround + salsa8_core_sse2_doubleround + salsa8_core_sse2_doubleround +.endm + + .p2align 5 +scrypt_core_sse2: + movl 20(%esp), %edi + movl 24(%esp), %esi + movl %esp, %ebp + subl $128, %esp + andl $-16, %esp + + scrypt_shuffle %edi, 0, %esp, 0 + scrypt_shuffle %edi, 64, %esp, 64 + + movdqa 96(%esp), %xmm6 + movdqa 112(%esp), %xmm7 + + movl %esi, %edx + movl 28(%ebp), %ecx + shll $7, %ecx + addl %esi, %ecx +scrypt_core_sse2_loop1: + movdqa 0(%esp), %xmm0 + movdqa 16(%esp), %xmm1 + movdqa 32(%esp), %xmm2 + movdqa 48(%esp), %xmm3 + movdqa 64(%esp), %xmm4 + movdqa 80(%esp), %xmm5 + pxor %xmm4, %xmm0 + 
pxor %xmm5, %xmm1 + movdqa %xmm0, 0(%edx) + movdqa %xmm1, 16(%edx) + pxor %xmm6, %xmm2 + pxor %xmm7, %xmm3 + movdqa %xmm2, 32(%edx) + movdqa %xmm3, 48(%edx) + movdqa %xmm4, 64(%edx) + movdqa %xmm5, 80(%edx) + movdqa %xmm6, 96(%edx) + movdqa %xmm7, 112(%edx) + + salsa8_core_sse2 + paddd 0(%edx), %xmm0 + paddd 16(%edx), %xmm1 + paddd 32(%edx), %xmm2 + paddd 48(%edx), %xmm3 + movdqa %xmm0, 0(%esp) + movdqa %xmm1, 16(%esp) + movdqa %xmm2, 32(%esp) + movdqa %xmm3, 48(%esp) + + pxor 64(%esp), %xmm0 + pxor 80(%esp), %xmm1 + pxor %xmm6, %xmm2 + pxor %xmm7, %xmm3 + movdqa %xmm0, 64(%esp) + movdqa %xmm1, 80(%esp) + movdqa %xmm2, %xmm6 + movdqa %xmm3, %xmm7 + salsa8_core_sse2 + paddd 64(%esp), %xmm0 + paddd 80(%esp), %xmm1 + paddd %xmm2, %xmm6 + paddd %xmm3, %xmm7 + movdqa %xmm0, 64(%esp) + movdqa %xmm1, 80(%esp) + + addl $128, %edx + cmpl %ecx, %edx + jne scrypt_core_sse2_loop1 + + movdqa 64(%esp), %xmm4 + movdqa 80(%esp), %xmm5 + + movl 28(%ebp), %ecx + movl %ecx, %eax + subl $1, %eax +scrypt_core_sse2_loop2: + movd %xmm4, %edx + movdqa 0(%esp), %xmm0 + movdqa 16(%esp), %xmm1 + movdqa 32(%esp), %xmm2 + movdqa 48(%esp), %xmm3 + andl %eax, %edx + shll $7, %edx + pxor 0(%esi, %edx), %xmm0 + pxor 16(%esi, %edx), %xmm1 + pxor 32(%esi, %edx), %xmm2 + pxor 48(%esi, %edx), %xmm3 + + pxor %xmm4, %xmm0 + pxor %xmm5, %xmm1 + movdqa %xmm0, 0(%esp) + movdqa %xmm1, 16(%esp) + pxor %xmm6, %xmm2 + pxor %xmm7, %xmm3 + movdqa %xmm2, 32(%esp) + movdqa %xmm3, 48(%esp) + salsa8_core_sse2 + paddd 0(%esp), %xmm0 + paddd 16(%esp), %xmm1 + paddd 32(%esp), %xmm2 + paddd 48(%esp), %xmm3 + movdqa %xmm0, 0(%esp) + movdqa %xmm1, 16(%esp) + movdqa %xmm2, 32(%esp) + movdqa %xmm3, 48(%esp) + + pxor 64(%esi, %edx), %xmm0 + pxor 80(%esi, %edx), %xmm1 + pxor 96(%esi, %edx), %xmm2 + pxor 112(%esi, %edx), %xmm3 + pxor 64(%esp), %xmm0 + pxor 80(%esp), %xmm1 + pxor %xmm6, %xmm2 + pxor %xmm7, %xmm3 + movdqa %xmm0, 64(%esp) + movdqa %xmm1, 80(%esp) + movdqa %xmm2, %xmm6 + movdqa %xmm3, %xmm7 + salsa8_core_sse2 + paddd 64(%esp), %xmm0 + paddd 80(%esp), %xmm1 + paddd %xmm2, %xmm6 + paddd %xmm3, %xmm7 + movdqa %xmm0, %xmm4 + movdqa %xmm1, %xmm5 + movdqa %xmm0, 64(%esp) + movdqa %xmm1, 80(%esp) + + subl $1, %ecx + ja scrypt_core_sse2_loop2 + + movdqa %xmm6, 96(%esp) + movdqa %xmm7, 112(%esp) + + scrypt_shuffle %esp, 0, %edi, 0 + scrypt_shuffle %esp, 64, %edi, 64 + + movl %ebp, %esp + popl %esi + popl %edi + popl %ebp + popl %ebx + ret + +#endif diff --git a/sha2-arm.S b/sha2-arm.S index bd7fdc5cb..f77a3e750 100644 --- a/sha2-arm.S +++ b/sha2-arm.S @@ -11,98 +11,12 @@ #if defined(USE_ASM) && defined(__arm__) && defined(__APCS_32__) -.macro sha256_k - .align 2 - .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 - .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 - .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 - .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 - .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc - .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da - .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 - .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 - .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 - .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 - .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 - .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 - .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 - .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 - .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 - .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 -.endm - -.macro 
sha256_extend_doubleround_core i, rw, ra, rb, ry, rz - mov r12, \ry, ror #17 - add r11, r11, \ra - eor r12, r12, \ry, ror #19 - mov \ra, lr, ror #7 - eor r12, r12, \ry, lsr #10 - eor \ra, \ra, lr, ror #18 - add r12, r12, r11 - ldr r11, [\rw, #(\i+2)*4] - eor \ra, \ra, lr, lsr #3 - add \ra, \ra, r12 - - mov r12, \rz, ror #17 - str \ra, [\rw, #(\i+16)*4] - add lr, lr, \rb - eor r12, r12, \rz, ror #19 - mov \rb, r11, ror #7 - eor r12, r12, \rz, lsr #10 - eor \rb, \rb, r11, ror #18 - add lr, lr, r12 - eor \rb, \rb, r11, lsr #3 - add \rb, \rb, lr -.endm - -.macro sha256_extend_doubleround_head i, rw, ra, rb, ry, rz - ldr lr, [\rw, #(\i+1)*4] - sha256_extend_doubleround_core \i, \rw, \ra, \rb, \ry, \rz - ldr lr, [\rw, #(\i+3)*4] -.endm - -.macro sha256_extend_doubleround_body i, rw, ra, rb, ry, rz - str \rz, [\rw, #(\i+15)*4] - sha256_extend_doubleround_core \i, \rw, \ra, \rb, \ry, \rz - ldr lr, [\rw, #(\i+3)*4] -.endm - -.macro sha256_extend_doubleround_foot i, rw, ra, rb, ry, rz - str \rz, [\rw, #(\i+15)*4] - sha256_extend_doubleround_core \i, \rw, \ra, \rb, \ry, \rz - str \rb, [\rw, #(\i+17)*4] -.endm - -.macro sha256_main_round i, ka, rw, ra, rb, rc, rd, re, rf, rg, rh - ldr r12, [\rw, #(\i)*4] - and r3, \rf, \re - bic lr, \rg, \re - orr lr, lr, r3 - ldr r3, \ka + (\i)*4 - add \rh, \rh, lr - eor lr, \re, \re, ror #5 - add \rh, \rh, r12 - eor lr, lr, \re, ror #19 - add \rh, \rh, r3 - eor r3, \ra, \rb - add \rh, \rh, lr, ror #6 - - and r3, r3, \rc - eor r12, \ra, \ra, ror #11 - and lr, \ra, \rb - eor r12, r12, \ra, ror #20 - eor lr, lr, r3 - add r3, \rh, lr - add \rh, \rh, \rd - add \rd, r3, r12, ror #2 -.endm - -.macro sha256_main_quadround i, ka, rw - sha256_main_round \i+0, \ka, \rw, r4, r5, r6, r7, r8, r9, r10, r11 - sha256_main_round \i+1, \ka, \rw, r7, r4, r5, r6, r11, r8, r9, r10 - sha256_main_round \i+2, \ka, \rw, r6, r7, r4, r5, r10, r11, r8, r9 - sha256_main_round \i+3, \ka, \rw, r5, r6, r7, r4, r9, r10, r11, r8 -.endm + + + + + + .text @@ -127,1084 +41,11457 @@ _sha256_transform: stmia r3, {r4-r11} b sha256_transform_extend -.macro bswap rd, rn - eor r12, \rn, \rn, ror #16 - bic r12, r12, #0x00ff0000 - mov \rd, \rn, ror #8 - eor \rd, \rd, r12, lsr #8 -.endm sha256_transform_swap: ldmia r1!, {r4-r11} - bswap r4, r4 - bswap r5, r5 - bswap r6, r6 - bswap r7, r7 - bswap r8, r8 - bswap r9, r9 - bswap r10, r10 - bswap r11, r11 + eor r12, r4, r4, ror #16 + bic r12, r12, #0x00ff0000 + mov r4, r4, ror #8 + eor r4, r4, r12, lsr #8 + eor r12, r5, r5, ror #16 + bic r12, r12, #0x00ff0000 + mov r5, r5, ror #8 + eor r5, r5, r12, lsr #8 + eor r12, r6, r6, ror #16 + bic r12, r12, #0x00ff0000 + mov r6, r6, ror #8 + eor r6, r6, r12, lsr #8 + eor r12, r7, r7, ror #16 + bic r12, r12, #0x00ff0000 + mov r7, r7, ror #8 + eor r7, r7, r12, lsr #8 + eor r12, r8, r8, ror #16 + bic r12, r12, #0x00ff0000 + mov r8, r8, ror #8 + eor r8, r8, r12, lsr #8 + eor r12, r9, r9, ror #16 + bic r12, r12, #0x00ff0000 + mov r9, r9, ror #8 + eor r9, r9, r12, lsr #8 + eor r12, r10, r10, ror #16 + bic r12, r12, #0x00ff0000 + mov r10, r10, ror #8 + eor r10, r10, r12, lsr #8 + eor r12, r11, r11, ror #16 + bic r12, r12, #0x00ff0000 + mov r11, r11, ror #8 + eor r11, r11, r12, lsr #8 stmia sp, {r4-r11} add r3, sp, #8*4 ldmia r1, {r4-r11} - bswap r4, r4 - bswap r5, r5 - bswap r6, r6 - bswap r7, r7 - bswap r8, r8 - bswap r9, r9 - bswap r10, r10 - bswap r11, r11 + eor r12, r4, r4, ror #16 + bic r12, r12, #0x00ff0000 + mov r4, r4, ror #8 + eor r4, r4, r12, lsr #8 + eor r12, r5, r5, ror #16 + bic r12, r12, #0x00ff0000 + mov r5, r5, ror #8 
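(Side note, not part of the patch: the four-instruction pattern repeated in sha256_transform_swap above, eor with ror #16, bic of 0x00ff0000, mov with ror #8, eor with lsr #8, is the classic pre-ARMv6 32-bit byte swap, written without assuming a REV instruction. In C it corresponds roughly to:)

#include <stdint.h>

/* Byte-swap a 32-bit word using only rotates, shifts and masks,
 * mirroring the eor/bic/mov/eor sequence in the assembly above. */
static uint32_t bswap32_ref(uint32_t x)
{
	uint32_t t;

	t = x ^ ((x >> 16) | (x << 16));           /* x ^ ror(x, 16)       */
	t &= ~0x00ff0000u;                         /* clear byte 2 of t    */
	return ((x >> 8) | (x << 24)) ^ (t >> 8);  /* ror(x, 8) ^ (t >> 8) */
}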
+ eor r5, r5, r12, lsr #8 + eor r12, r6, r6, ror #16 + bic r12, r12, #0x00ff0000 + mov r6, r6, ror #8 + eor r6, r6, r12, lsr #8 + eor r12, r7, r7, ror #16 + bic r12, r12, #0x00ff0000 + mov r7, r7, ror #8 + eor r7, r7, r12, lsr #8 + eor r12, r8, r8, ror #16 + bic r12, r12, #0x00ff0000 + mov r8, r8, ror #8 + eor r8, r8, r12, lsr #8 + eor r12, r9, r9, ror #16 + bic r12, r12, #0x00ff0000 + mov r9, r9, ror #8 + eor r9, r9, r12, lsr #8 + eor r12, r10, r10, ror #16 + bic r12, r12, #0x00ff0000 + mov r10, r10, ror #8 + eor r10, r10, r12, lsr #8 + eor r12, r11, r11, ror #16 + bic r12, r12, #0x00ff0000 + mov r11, r11, ror #8 + eor r11, r11, r12, lsr #8 stmia r3, {r4-r11} sha256_transform_extend: add r12, sp, #9*4 ldr r11, [sp, #0*4] ldmia r12, {r4-r10} - sha256_extend_doubleround_head 0, sp, r4, r5, r9, r10 - sha256_extend_doubleround_body 2, sp, r6, r7, r4, r5 - sha256_extend_doubleround_body 4, sp, r8, r9, r6, r7 - sha256_extend_doubleround_body 6, sp, r10, r4, r8, r9 - sha256_extend_doubleround_body 8, sp, r5, r6, r10, r4 - sha256_extend_doubleround_body 10, sp, r7, r8, r5, r6 - sha256_extend_doubleround_body 12, sp, r9, r10, r7, r8 - sha256_extend_doubleround_body 14, sp, r4, r5, r9, r10 - sha256_extend_doubleround_body 16, sp, r6, r7, r4, r5 - sha256_extend_doubleround_body 18, sp, r8, r9, r6, r7 - sha256_extend_doubleround_body 20, sp, r10, r4, r8, r9 - sha256_extend_doubleround_body 22, sp, r5, r6, r10, r4 - sha256_extend_doubleround_body 24, sp, r7, r8, r5, r6 - sha256_extend_doubleround_body 26, sp, r9, r10, r7, r8 - sha256_extend_doubleround_body 28, sp, r4, r5, r9, r10 - sha256_extend_doubleround_body 30, sp, r6, r7, r4, r5 - sha256_extend_doubleround_body 32, sp, r8, r9, r6, r7 - sha256_extend_doubleround_body 34, sp, r10, r4, r8, r9 - sha256_extend_doubleround_body 36, sp, r5, r6, r10, r4 - sha256_extend_doubleround_body 38, sp, r7, r8, r5, r6 - sha256_extend_doubleround_body 40, sp, r9, r10, r7, r8 - sha256_extend_doubleround_body 42, sp, r4, r5, r9, r10 - sha256_extend_doubleround_body 44, sp, r6, r7, r4, r5 - sha256_extend_doubleround_foot 46, sp, r8, r9, r6, r7 - - ldmia r0, {r4-r11} - sha256_main_quadround 0, sha256_transform_k, sp - sha256_main_quadround 4, sha256_transform_k, sp - sha256_main_quadround 8, sha256_transform_k, sp - sha256_main_quadround 12, sha256_transform_k, sp - sha256_main_quadround 16, sha256_transform_k, sp - sha256_main_quadround 20, sha256_transform_k, sp - sha256_main_quadround 24, sha256_transform_k, sp - sha256_main_quadround 28, sha256_transform_k, sp - b sha256_transform_k_over -sha256_transform_k: - sha256_k -sha256_transform_k_over: - sha256_main_quadround 32, sha256_transform_k, sp - sha256_main_quadround 36, sha256_transform_k, sp - sha256_main_quadround 40, sha256_transform_k, sp - sha256_main_quadround 44, sha256_transform_k, sp - sha256_main_quadround 48, sha256_transform_k, sp - sha256_main_quadround 52, sha256_transform_k, sp - sha256_main_quadround 56, sha256_transform_k, sp - sha256_main_quadround 60, sha256_transform_k, sp - - ldmia r0, {r1, r2, r3, r12} - add r4, r4, r1 - add r5, r5, r2 - add r6, r6, r3 - add r7, r7, r12 - stmia r0!, {r4-r7} - ldmia r0, {r1, r2, r3, r12} - add r8, r8, r1 - add r9, r9, r2 - add r10, r10, r3 - add r11, r11, r12 - stmia r0, {r8-r11} - - add sp, sp, #64*4 -#ifdef __thumb__ - ldmfd sp!, {r4-r11, lr} - bx lr -#else - ldmfd sp!, {r4-r11, pc} -#endif - + ldr lr, [sp, #(0+1)*4] + mov r12, r9, ror #17 + add r11, r11, r4 + eor r12, r12, r9, ror #19 + mov r4, lr, ror #7 + eor r12, r12, r9, lsr #10 + eor r4, r4, lr, ror 
#18 + add r12, r12, r11 + ldr r11, [sp, #(0+2)*4] + eor r4, r4, lr, lsr #3 + add r4, r4, r12 - .text - .code 32 - .align 2 - .globl sha256d_ms - .globl _sha256d_ms -#ifdef __ELF__ - .type sha256d_ms, %function -#endif -sha256d_ms: -_sha256d_ms: - stmfd sp!, {r4-r11, lr} - sub sp, sp, #64*4 - - cmp r0, r0 - - ldr lr, [r1, #3*4] - ldr r6, [r1, #18*4] - ldr r7, [r1, #19*4] - - mov r12, lr, ror #7 - str r6, [sp, #18*4] - eor r12, r12, lr, ror #18 - str r7, [sp, #19*4] - eor r12, r12, lr, lsr #3 - ldr r8, [r1, #20*4] + mov r12, r10, ror #17 + str r4, [sp, #(0+16)*4] + add lr, lr, r5 + eor r12, r12, r10, ror #19 + mov r5, r11, ror #7 + eor r12, r12, r10, lsr #10 + eor r5, r5, r11, ror #18 + add lr, lr, r12 + eor r5, r5, r11, lsr #3 + add r5, r5, lr + ldr lr, [sp, #(0+3)*4] + str r5, [sp, #(2+15)*4] + mov r12, r4, ror #17 + add r11, r11, r6 + eor r12, r12, r4, ror #19 + mov r6, lr, ror #7 + eor r12, r12, r4, lsr #10 + eor r6, r6, lr, ror #18 + add r12, r12, r11 + ldr r11, [sp, #(2+2)*4] + eor r6, r6, lr, lsr #3 add r6, r6, r12 - ldr r10, [r1, #22*4] + + mov r12, r5, ror #17 + str r6, [sp, #(2+16)*4] + add lr, lr, r7 + eor r12, r12, r5, ror #19 + mov r7, r11, ror #7 + eor r12, r12, r5, lsr #10 + eor r7, r7, r11, ror #18 + add lr, lr, r12 + eor r7, r7, r11, lsr #3 add r7, r7, lr - str r6, [r1, #18*4] - + ldr lr, [sp, #(2+3)*4] + str r7, [sp, #(4+15)*4] mov r12, r6, ror #17 - str r7, [r1, #19*4] + add r11, r11, r8 eor r12, r12, r6, ror #19 - str r8, [sp, #20*4] + mov r8, lr, ror #7 eor r12, r12, r6, lsr #10 - ldr r4, [r1, #23*4] + eor r8, r8, lr, ror #18 + add r12, r12, r11 + ldr r11, [sp, #(4+2)*4] + eor r8, r8, lr, lsr #3 add r8, r8, r12 - ldr r5, [r1, #24*4] - - mov r9, r7, ror #17 - str r8, [r1, #20*4] - eor r9, r9, r7, ror #19 - str r10, [sp, #21*4] - eor r9, r9, r7, lsr #10 - str r4, [sp, #22*4] - + + mov r12, r7, ror #17 + str r8, [sp, #(4+16)*4] + add lr, lr, r9 + eor r12, r12, r7, ror #19 + mov r9, r11, ror #7 + eor r12, r12, r7, lsr #10 + eor r9, r9, r11, ror #18 + add lr, lr, r12 + eor r9, r9, r11, lsr #3 + add r9, r9, lr + ldr lr, [sp, #(4+3)*4] + str r9, [sp, #(6+15)*4] mov r12, r8, ror #17 - str r9, [r1, #21*4] + add r11, r11, r10 eor r12, r12, r8, ror #19 - str r5, [sp, #23*4] + mov r10, lr, ror #7 eor r12, r12, r8, lsr #10 - mov lr, r9, ror #17 + eor r10, r10, lr, ror #18 + add r12, r12, r11 + ldr r11, [sp, #(6+2)*4] + eor r10, r10, lr, lsr #3 add r10, r10, r12 - ldr r11, [r1, #30*4] - - eor lr, lr, r9, ror #19 - str r10, [r1, #22*4] - eor lr, lr, r9, lsr #10 - str r11, [sp, #24*4] + + mov r12, r9, ror #17 + str r10, [sp, #(6+16)*4] + add lr, lr, r4 + eor r12, r12, r9, ror #19 + mov r4, r11, ror #7 + eor r12, r12, r9, lsr #10 + eor r4, r4, r11, ror #18 + add lr, lr, r12 + eor r4, r4, r11, lsr #3 add r4, r4, lr - + ldr lr, [sp, #(6+3)*4] + str r4, [sp, #(8+15)*4] mov r12, r10, ror #17 - str r4, [r1, #23*4] + add r11, r11, r5 eor r12, r12, r10, ror #19 - mov lr, r4, ror #17 + mov r5, lr, ror #7 eor r12, r12, r10, lsr #10 - eor lr, lr, r4, ror #19 + eor r5, r5, lr, ror #18 + add r12, r12, r11 + ldr r11, [sp, #(8+2)*4] + eor r5, r5, lr, lsr #3 add r5, r5, r12 - eor lr, lr, r4, lsr #10 - str r5, [r1, #24*4] + + mov r12, r4, ror #17 + str r5, [sp, #(8+16)*4] + add lr, lr, r6 + eor r12, r12, r4, ror #19 + mov r6, r11, ror #7 + eor r12, r12, r4, lsr #10 + eor r6, r6, r11, ror #18 + add lr, lr, r12 + eor r6, r6, r11, lsr #3 add r6, r6, lr - + ldr lr, [sp, #(8+3)*4] + str r6, [sp, #(10+15)*4] mov r12, r5, ror #17 - str r6, [r1, #25*4] + add r11, r11, r7 eor r12, r12, r5, ror #19 - mov lr, r6, 
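(Reference, not part of the patch: each unrolled "doubleround" in the message-schedule code above produces two words of the standard SHA-256 schedule, W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]; the ror #17/#19/lsr #10 group is sigma1 and the ror #7/#18/lsr #3 group is sigma0. In C:)

#include <stdint.h>

#define ROTR(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ ((x) >> 3))
#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ ((x) >> 10))

/* Expand the 16 input words into the full 64-word schedule. */
static void sha256_schedule(uint32_t W[64])
{
	int t;

	for (t = 16; t < 64; t++)
		W[t] = s1(W[t - 2]) + W[t - 7] + s0(W[t - 15]) + W[t - 16];
}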
ror #17 + mov r7, lr, ror #7 eor r12, r12, r5, lsr #10 - eor lr, lr, r6, ror #19 + eor r7, r7, lr, ror #18 + add r12, r12, r11 + ldr r11, [sp, #(10+2)*4] + eor r7, r7, lr, lsr #3 add r7, r7, r12 - eor lr, lr, r6, lsr #10 - str r7, [r1, #26*4] + + mov r12, r6, ror #17 + str r7, [sp, #(10+16)*4] + add lr, lr, r8 + eor r12, r12, r6, ror #19 + mov r8, r11, ror #7 + eor r12, r12, r6, lsr #10 + eor r8, r8, r11, ror #18 + add lr, lr, r12 + eor r8, r8, r11, lsr #3 add r8, r8, lr - + ldr lr, [sp, #(10+3)*4] + str r8, [sp, #(12+15)*4] mov r12, r7, ror #17 - str r8, [r1, #27*4] + add r11, r11, r9 eor r12, r12, r7, ror #19 - mov lr, r8, ror #17 + mov r9, lr, ror #7 eor r12, r12, r7, lsr #10 - eor lr, lr, r8, ror #19 + eor r9, r9, lr, ror #18 + add r12, r12, r11 + ldr r11, [sp, #(12+2)*4] + eor r9, r9, lr, lsr #3 add r9, r9, r12 - eor lr, lr, r8, lsr #10 - str r9, [r1, #28*4] + + mov r12, r8, ror #17 + str r9, [sp, #(12+16)*4] + add lr, lr, r10 + eor r12, r12, r8, ror #19 + mov r10, r11, ror #7 + eor r12, r12, r8, lsr #10 + eor r10, r10, r11, ror #18 + add lr, lr, r12 + eor r10, r10, r11, lsr #3 add r10, r10, lr - - ldr lr, [r1, #31*4] + ldr lr, [sp, #(12+3)*4] + str r10, [sp, #(14+15)*4] mov r12, r9, ror #17 - str r10, [r1, #29*4] + add r11, r11, r4 eor r12, r12, r9, ror #19 - str lr, [sp, #25*4] + mov r4, lr, ror #7 eor r12, r12, r9, lsr #10 - add r11, r11, r12 - add r5, r5, lr + eor r4, r4, lr, ror #18 + add r12, r12, r11 + ldr r11, [sp, #(14+2)*4] + eor r4, r4, lr, lsr #3 + add r4, r4, r12 + mov r12, r10, ror #17 - add r4, r4, r11 - - ldr r11, [r1, #16*4] + str r4, [sp, #(14+16)*4] + add lr, lr, r5 eor r12, r12, r10, ror #19 - str r4, [r1, #30*4] + mov r5, r11, ror #7 eor r12, r12, r10, lsr #10 - add r5, r5, r12 - ldr lr, [r1, #17*4] - -sha256d_ms_extend_loop2: - sha256_extend_doubleround_body 16, r1, r6, r7, r4, r5 - sha256_extend_doubleround_body 18, r1, r8, r9, r6, r7 - sha256_extend_doubleround_body 20, r1, r10, r4, r8, r9 - sha256_extend_doubleround_body 22, r1, r5, r6, r10, r4 - sha256_extend_doubleround_body 24, r1, r7, r8, r5, r6 - sha256_extend_doubleround_body 26, r1, r9, r10, r7, r8 - sha256_extend_doubleround_body 28, r1, r4, r5, r9, r10 - sha256_extend_doubleround_body 30, r1, r6, r7, r4, r5 - sha256_extend_doubleround_body 32, r1, r8, r9, r6, r7 - sha256_extend_doubleround_body 34, r1, r10, r4, r8, r9 - sha256_extend_doubleround_body 36, r1, r5, r6, r10, r4 - sha256_extend_doubleround_body 38, r1, r7, r8, r5, r6 - sha256_extend_doubleround_body 40, r1, r9, r10, r7, r8 - sha256_extend_doubleround_body 42, r1, r4, r5, r9, r10 - bne sha256d_ms_extend_coda2 - sha256_extend_doubleround_body 44, r1, r6, r7, r4, r5 - sha256_extend_doubleround_foot 46, r1, r8, r9, r6, r7 - - ldr r4, [r3, #0*4] - ldr r9, [r3, #1*4] - ldr r10, [r3, #2*4] - ldr r11, [r3, #3*4] - ldr r8, [r3, #4*4] - ldr r5, [r3, #5*4] - ldr r6, [r3, #6*4] - ldr r7, [r3, #7*4] - b sha256d_ms_main_loop1 - -sha256d_ms_main_loop2: - sha256_main_round 0, sha256d_ms_k, r1, r4, r5, r6, r7, r8, r9, r10, r11 - sha256_main_round 1, sha256d_ms_k, r1, r7, r4, r5, r6, r11, r8, r9, r10 - sha256_main_round 2, sha256d_ms_k, r1, r6, r7, r4, r5, r10, r11, r8, r9 -sha256d_ms_main_loop1: - sha256_main_round 3, sha256d_ms_k, r1, r5, r6, r7, r4, r9, r10, r11, r8 - sha256_main_quadround 4, sha256d_ms_k, r1 - sha256_main_quadround 8, sha256d_ms_k, r1 - sha256_main_quadround 12, sha256d_ms_k, r1 - sha256_main_quadround 16, sha256d_ms_k, r1 - sha256_main_quadround 20, sha256d_ms_k, r1 - sha256_main_quadround 24, sha256d_ms_k, r1 - sha256_main_quadround 
28, sha256d_ms_k, r1 - b sha256d_ms_k_over -sha256d_ms_k: - sha256_k -sha256d_ms_k_over: - sha256_main_quadround 32, sha256d_ms_k, r1 - sha256_main_quadround 36, sha256d_ms_k, r1 - sha256_main_quadround 40, sha256d_ms_k, r1 - sha256_main_quadround 44, sha256d_ms_k, r1 - sha256_main_quadround 48, sha256d_ms_k, r1 - sha256_main_quadround 52, sha256d_ms_k, r1 - sha256_main_round 56, sha256d_ms_k, r1, r4, r5, r6, r7, r8, r9, r10, r11 - bne sha256d_ms_finish - sha256_main_round 57, sha256d_ms_k, r1, r7, r4, r5, r6, r11, r8, r9, r10 - sha256_main_round 58, sha256d_ms_k, r1, r6, r7, r4, r5, r10, r11, r8, r9 - sha256_main_round 59, sha256d_ms_k, r1, r5, r6, r7, r4, r9, r10, r11, r8 - sha256_main_quadround 60, sha256d_ms_k, r1 - - ldmia r2!, {r3, r12, lr} - add r4, r4, r3 - add r5, r5, r12 - add r6, r6, lr - stmia sp, {r4-r6} - ldmia r2, {r3, r4, r5, r6, r12} - add lr, sp, #3*4 - add r7, r7, r3 - add r8, r8, r4 - add r9, r9, r5 - add r10, r10, r6 - add r11, r11, r12 - add r12, sp, #18*4 - stmia lr!, {r7-r11} - - ldmia r12, {r4-r11} - str r4, [r1, #18*4] - str r5, [r1, #19*4] - str r6, [r1, #20*4] - str r7, [r1, #22*4] - str r8, [r1, #23*4] - str r9, [r1, #24*4] - str r10, [r1, #30*4] - str r11, [r1, #31*4] - - mov r3, #0x80000000 - mov r4, #0 - mov r5, #0 - mov r6, #0 - mov r7, #0 - mov r8, #0 - mov r9, #0 - mov r10, #0x00000100 - stmia lr, {r3-r10} - - ldr lr, [sp, #1*4] - movs r1, sp - ldr r4, [sp, #0*4] - - ldr r11, [sp, #2*4] - mov r12, lr, ror #7 - eor r12, r12, lr, ror #18 - add r5, lr, #0x00a00000 - eor r12, r12, lr, lsr #3 - mov lr, r11, ror #7 - add r4, r4, r12 - eor lr, lr, r11, ror #18 - str r4, [sp, #16*4] - eor lr, lr, r11, lsr #3 - mov r12, r4, ror #17 + eor r5, r5, r11, ror #18 + add lr, lr, r12 + eor r5, r5, r11, lsr #3 add r5, r5, lr - ldr lr, [sp, #3*4] - - str r5, [sp, #17*4] + ldr lr, [sp, #(14+3)*4] + str r5, [sp, #(16+15)*4] + mov r12, r4, ror #17 + add r11, r11, r6 eor r12, r12, r4, ror #19 mov r6, lr, ror #7 eor r12, r12, r4, lsr #10 eor r6, r6, lr, ror #18 - add r11, r11, r12 + add r12, r12, r11 + ldr r11, [sp, #(16+2)*4] eor r6, r6, lr, lsr #3 + add r6, r6, r12 + mov r12, r5, ror #17 - add r6, r6, r11 - ldr r11, [sp, #4*4] - - str r6, [sp, #18*4] + str r6, [sp, #(16+16)*4] + add lr, lr, r7 eor r12, r12, r5, ror #19 mov r7, r11, ror #7 eor r12, r12, r5, lsr #10 eor r7, r7, r11, ror #18 add lr, lr, r12 eor r7, r7, r11, lsr #3 - mov r12, r6, ror #17 add r7, r7, lr - ldr lr, [sp, #5*4] - - str r7, [sp, #19*4] + ldr lr, [sp, #(16+3)*4] + str r7, [sp, #(18+15)*4] + mov r12, r6, ror #17 + add r11, r11, r8 eor r12, r12, r6, ror #19 mov r8, lr, ror #7 eor r12, r12, r6, lsr #10 eor r8, r8, lr, ror #18 - add r11, r11, r12 + add r12, r12, r11 + ldr r11, [sp, #(18+2)*4] eor r8, r8, lr, lsr #3 + add r8, r8, r12 + mov r12, r7, ror #17 - add r8, r8, r11 - ldr r11, [sp, #6*4] - - str r8, [sp, #20*4] + str r8, [sp, #(18+16)*4] + add lr, lr, r9 eor r12, r12, r7, ror #19 mov r9, r11, ror #7 eor r12, r12, r7, lsr #10 eor r9, r9, r11, ror #18 add lr, lr, r12 eor r9, r9, r11, lsr #3 - mov r12, r8, ror #17 add r9, r9, lr - ldr lr, [sp, #7*4] - - str r9, [sp, #21*4] + ldr lr, [sp, #(18+3)*4] + str r9, [sp, #(20+15)*4] + mov r12, r8, ror #17 + add r11, r11, r10 eor r12, r12, r8, ror #19 mov r10, lr, ror #7 eor r12, r12, r8, lsr #10 eor r10, r10, lr, ror #18 - add r11, r11, r12 + add r12, r12, r11 + ldr r11, [sp, #(20+2)*4] eor r10, r10, lr, lsr #3 + add r10, r10, r12 + mov r12, r9, ror #17 - add r11, r11, #0x00000100 + str r10, [sp, #(20+16)*4] add lr, lr, r4 - add r10, r10, r11 - eor r12, r12, r9, 
ror #19 - str r10, [sp, #22*4] - add lr, lr, #0x11000000 + mov r4, r11, ror #7 eor r12, r12, r9, lsr #10 + eor r4, r4, r11, ror #18 add lr, lr, r12 + eor r4, r4, r11, lsr #3 + add r4, r4, lr + ldr lr, [sp, #(20+3)*4] + str r4, [sp, #(22+15)*4] mov r12, r10, ror #17 - add r4, lr, #0x00002000 + add r11, r11, r5 eor r12, r12, r10, ror #19 - str r4, [sp, #23*4] - add r5, r5, #0x80000000 + mov r5, lr, ror #7 eor r12, r12, r10, lsr #10 + eor r5, r5, lr, ror #18 + add r12, r12, r11 + ldr r11, [sp, #(22+2)*4] + eor r5, r5, lr, lsr #3 add r5, r5, r12 mov r12, r4, ror #17 - str r5, [sp, #24*4] + str r5, [sp, #(22+16)*4] + add lr, lr, r6 eor r12, r12, r4, ror #19 - mov r11, r5, ror #17 + mov r6, r11, ror #7 eor r12, r12, r4, lsr #10 - eor r11, r11, r5, ror #19 - add r6, r6, r12 - eor r11, r11, r5, lsr #10 - str r6, [sp, #25*4] - add r7, r7, r11 - + eor r6, r6, r11, ror #18 + add lr, lr, r12 + eor r6, r6, r11, lsr #3 + add r6, r6, lr + ldr lr, [sp, #(22+3)*4] + str r6, [sp, #(24+15)*4] + mov r12, r5, ror #17 + add r11, r11, r7 + eor r12, r12, r5, ror #19 + mov r7, lr, ror #7 + eor r12, r12, r5, lsr #10 + eor r7, r7, lr, ror #18 + add r12, r12, r11 + ldr r11, [sp, #(24+2)*4] + eor r7, r7, lr, lsr #3 + add r7, r7, r12 + mov r12, r6, ror #17 - str r7, [sp, #26*4] + str r7, [sp, #(24+16)*4] + add lr, lr, r8 eor r12, r12, r6, ror #19 - mov r11, r7, ror #17 + mov r8, r11, ror #7 eor r12, r12, r6, lsr #10 - eor r11, r11, r7, ror #19 - add r8, r8, r12 - eor r11, r11, r7, lsr #10 - str r8, [sp, #27*4] - add r9, r9, r11 - - mov lr, r8, ror #17 + eor r8, r8, r11, ror #18 + add lr, lr, r12 + eor r8, r8, r11, lsr #3 + add r8, r8, lr + ldr lr, [sp, #(24+3)*4] + str r8, [sp, #(26+15)*4] + mov r12, r7, ror #17 + add r11, r11, r9 + eor r12, r12, r7, ror #19 + mov r9, lr, ror #7 + eor r12, r12, r7, lsr #10 + eor r9, r9, lr, ror #18 + add r12, r12, r11 + ldr r11, [sp, #(26+2)*4] + eor r9, r9, lr, lsr #3 + add r9, r9, r12 + + mov r12, r8, ror #17 + str r9, [sp, #(26+16)*4] + add lr, lr, r10 + eor r12, r12, r8, ror #19 + mov r10, r11, ror #7 + eor r12, r12, r8, lsr #10 + eor r10, r10, r11, ror #18 + add lr, lr, r12 + eor r10, r10, r11, lsr #3 + add r10, r10, lr + ldr lr, [sp, #(26+3)*4] + str r10, [sp, #(28+15)*4] mov r12, r9, ror #17 - str r9, [sp, #28*4] - add r4, r4, #0x00400000 - eor lr, lr, r8, ror #19 + add r11, r11, r4 eor r12, r12, r9, ror #19 - eor lr, lr, r8, lsr #10 + mov r4, lr, ror #7 eor r12, r12, r9, lsr #10 - add r4, r4, #0x00000022 - add r10, r10, lr + eor r4, r4, lr, ror #18 + add r12, r12, r11 + ldr r11, [sp, #(28+2)*4] + eor r4, r4, lr, lsr #3 add r4, r4, r12 - ldr r11, [sp, #16*4] - - add r5, r5, #0x00000100 - str r4, [sp, #30*4] - mov lr, r11, ror #7 - str r10, [sp, #29*4] + mov r12, r10, ror #17 - eor lr, lr, r11, ror #18 + str r4, [sp, #(28+16)*4] + add lr, lr, r5 eor r12, r12, r10, ror #19 - eor lr, lr, r11, lsr #3 + mov r5, r11, ror #7 eor r12, r12, r10, lsr #10 + eor r5, r5, r11, ror #18 + add lr, lr, r12 + eor r5, r5, r11, lsr #3 add r5, r5, lr - ldr lr, [r1, #17*4] - add r5, r5, r12 - - b sha256d_ms_extend_loop2 - -sha256d_ms_extend_coda2: - str r5, [r1, #(44+15)*4] + ldr lr, [sp, #(28+3)*4] + str r5, [sp, #(30+15)*4] mov r12, r4, ror #17 add r11, r11, r6 - mov r6, lr, ror #7 eor r12, r12, r4, ror #19 - eor r6, r6, lr, ror #18 + mov r6, lr, ror #7 eor r12, r12, r4, lsr #10 - eor r6, r6, lr, lsr #3 + eor r6, r6, lr, ror #18 add r12, r12, r11 + ldr r11, [sp, #(30+2)*4] + eor r6, r6, lr, lsr #3 add r6, r6, r12 - str r6, [r1, #(44+16)*4] - - adr r2, sha256d_ms_h - ldmia r2, {r4-r11} - b 
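(Orientation note, not part of the patch: the constants stored above in sha256d_ms, 0x80000000 followed by zeros and 0x00000100, are the fixed SHA-256 padding of the second hash in sha256d, whose message is always the 32-byte first digest; the other odd-looking constants appear to be schedule terms precomputed from that fixed padding. A plain, unoptimised sketch of the second compression, with sha256_transform_ref as an assumed helper that compresses one 16-word block:)

#include <stdint.h>
#include <string.h>

/* Assumed helper (see note above). */
void sha256_transform_ref(uint32_t state[8], const uint32_t block[16]);

/* Second half of sha256d: the 32-byte first digest is the whole
 * message, so its single padded block is fixed apart from W[0..7]. */
static void sha256d_second(uint32_t state[8], const uint32_t hash1[8])
{
	uint32_t block[16];

	memcpy(block, hash1, 32);       /* W[0..7] = first digest  */
	block[8] = 0x80000000;          /* the appended '1' bit    */
	memset(&block[9], 0, 6 * 4);    /* zero padding            */
	block[15] = 32 * 8;             /* length in bits = 0x100  */
	sha256_transform_ref(state, block);
}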
sha256d_ms_main_loop2 -sha256d_ms_h: - .long 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a - .long 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 + mov r12, r5, ror #17 + str r6, [sp, #(30+16)*4] + add lr, lr, r7 + eor r12, r12, r5, ror #19 + mov r7, r11, ror #7 + eor r12, r12, r5, lsr #10 + eor r7, r7, r11, ror #18 + add lr, lr, r12 + eor r7, r7, r11, lsr #3 + add r7, r7, lr + ldr lr, [sp, #(30+3)*4] + str r7, [sp, #(32+15)*4] + mov r12, r6, ror #17 + add r11, r11, r8 + eor r12, r12, r6, ror #19 + mov r8, lr, ror #7 + eor r12, r12, r6, lsr #10 + eor r8, r8, lr, ror #18 + add r12, r12, r11 + ldr r11, [sp, #(32+2)*4] + eor r8, r8, lr, lsr #3 + add r8, r8, r12 -.macro sha256_main_round_red i, ka, rw, rd, re, rf, rg, rh - ldr r12, [\rw, #(\i)*4] - and r3, \rf, \re - bic lr, \rg, \re - add \rh, \rh, \rd - orr lr, lr, r3 - ldr r3, \ka + (\i)*4 - add \rh, \rh, lr - eor lr, \re, \re, ror #5 - add \rh, \rh, r12 - eor lr, lr, \re, ror #19 - add \rh, \rh, r3 - add \rh, \rh, lr, ror #6 -.endm - -sha256d_ms_finish: - sha256_main_round_red 57, sha256d_ms_k, r1, r6, r11, r8, r9, r10 - sha256_main_round_red 58, sha256d_ms_k, r1, r5, r10, r11, r8, r9 - sha256_main_round_red 59, sha256d_ms_k, r1, r4, r9, r10, r11, r8 - ldr r5, [r2, #7*4] - sha256_main_round_red 60, sha256d_ms_k, r1, r7, r8, r9, r10, r11 - + mov r12, r7, ror #17 + str r8, [sp, #(32+16)*4] + add lr, lr, r9 + eor r12, r12, r7, ror #19 + mov r9, r11, ror #7 + eor r12, r12, r7, lsr #10 + eor r9, r9, r11, ror #18 + add lr, lr, r12 + eor r9, r9, r11, lsr #3 + add r9, r9, lr + ldr lr, [sp, #(32+3)*4] + str r9, [sp, #(34+15)*4] + mov r12, r8, ror #17 + add r11, r11, r10 + eor r12, r12, r8, ror #19 + mov r10, lr, ror #7 + eor r12, r12, r8, lsr #10 + eor r10, r10, lr, ror #18 + add r12, r12, r11 + ldr r11, [sp, #(34+2)*4] + eor r10, r10, lr, lsr #3 + add r10, r10, r12 + + mov r12, r9, ror #17 + str r10, [sp, #(34+16)*4] + add lr, lr, r4 + eor r12, r12, r9, ror #19 + mov r4, r11, ror #7 + eor r12, r12, r9, lsr #10 + eor r4, r4, r11, ror #18 + add lr, lr, r12 + eor r4, r4, r11, lsr #3 + add r4, r4, lr + ldr lr, [sp, #(34+3)*4] + str r4, [sp, #(36+15)*4] + mov r12, r10, ror #17 add r11, r11, r5 - str r11, [r0, #7*4] - - add sp, sp, #64*4 -#ifdef __thumb__ - ldmfd sp!, {r4-r11, lr} - bx lr -#else - ldmfd sp!, {r4-r11, pc} -#endif + eor r12, r12, r10, ror #19 + mov r5, lr, ror #7 + eor r12, r12, r10, lsr #10 + eor r5, r5, lr, ror #18 + add r12, r12, r11 + ldr r11, [sp, #(36+2)*4] + eor r5, r5, lr, lsr #3 + add r5, r5, r12 + mov r12, r4, ror #17 + str r5, [sp, #(36+16)*4] + add lr, lr, r6 + eor r12, r12, r4, ror #19 + mov r6, r11, ror #7 + eor r12, r12, r4, lsr #10 + eor r6, r6, r11, ror #18 + add lr, lr, r12 + eor r6, r6, r11, lsr #3 + add r6, r6, lr + ldr lr, [sp, #(36+3)*4] + str r6, [sp, #(38+15)*4] + mov r12, r5, ror #17 + add r11, r11, r7 + eor r12, r12, r5, ror #19 + mov r7, lr, ror #7 + eor r12, r12, r5, lsr #10 + eor r7, r7, lr, ror #18 + add r12, r12, r11 + ldr r11, [sp, #(38+2)*4] + eor r7, r7, lr, lsr #3 + add r7, r7, r12 -#ifdef __ARM_NEON__ + mov r12, r6, ror #17 + str r7, [sp, #(38+16)*4] + add lr, lr, r8 + eor r12, r12, r6, ror #19 + mov r8, r11, ror #7 + eor r12, r12, r6, lsr #10 + eor r8, r8, r11, ror #18 + add lr, lr, r12 + eor r8, r8, r11, lsr #3 + add r8, r8, lr + ldr lr, [sp, #(38+3)*4] + str r8, [sp, #(40+15)*4] + mov r12, r7, ror #17 + add r11, r11, r9 + eor r12, r12, r7, ror #19 + mov r9, lr, ror #7 + eor r12, r12, r7, lsr #10 + eor r9, r9, lr, ror #18 + add r12, r12, r11 + ldr r11, [sp, #(40+2)*4] + eor r9, r9, lr, lsr #3 + add 
r9, r9, r12 - .text - .code 32 - .align 2 - .globl sha256_init_4way - .globl _sha256_init_4way -#ifdef __ELF__ - .type sha256_init_4way, %function -#endif -sha256_init_4way: -_sha256_init_4way: - adr r12, sha256_4h - vldmia r12, {q8-q15} - vstmia r0, {q8-q15} - bx lr - .align 4 -sha256_4h: - .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 - .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 - .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372 - .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a - .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f - .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c - .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab - .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 + mov r12, r8, ror #17 + str r9, [sp, #(40+16)*4] + add lr, lr, r10 + eor r12, r12, r8, ror #19 + mov r10, r11, ror #7 + eor r12, r12, r8, lsr #10 + eor r10, r10, r11, ror #18 + add lr, lr, r12 + eor r10, r10, r11, lsr #3 + add r10, r10, lr + ldr lr, [sp, #(40+3)*4] + str r10, [sp, #(42+15)*4] + mov r12, r9, ror #17 + add r11, r11, r4 + eor r12, r12, r9, ror #19 + mov r4, lr, ror #7 + eor r12, r12, r9, lsr #10 + eor r4, r4, lr, ror #18 + add r12, r12, r11 + ldr r11, [sp, #(42+2)*4] + eor r4, r4, lr, lsr #3 + add r4, r4, r12 -.macro sha256_4k - .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98 - .long 0x71374491, 0x71374491, 0x71374491, 0x71374491 - .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf - .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5 - .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b - .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1 - .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4 - .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5 - .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98 - .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01 - .long 0x243185be, 0x243185be, 0x243185be, 0x243185be - .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3 - .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74 - .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe - .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7 - .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174 - .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1 - .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786 - .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6 - .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc - .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f - .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa - .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc - .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da - .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152 - .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d - .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8 - .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7 - .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3 - .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147 - .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351 - .long 0x14292967, 0x14292967, 0x14292967, 0x14292967 - .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85 - .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138 - .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc - .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13 - .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354 - .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb - .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e - .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85 - .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1 - .long 0xa81a664b, 
0xa81a664b, 0xa81a664b, 0xa81a664b - .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70 - .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3 - .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819 - .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624 - .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585 - .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070 - .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116 - .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08 - .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c - .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5 - .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3 - .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a - .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f - .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3 - .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee - .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f - .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814 - .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208 - .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa - .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb - .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7 - .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2 -.endm + mov r12, r10, ror #17 + str r4, [sp, #(42+16)*4] + add lr, lr, r5 + eor r12, r12, r10, ror #19 + mov r5, r11, ror #7 + eor r12, r12, r10, lsr #10 + eor r5, r5, r11, ror #18 + add lr, lr, r12 + eor r5, r5, r11, lsr #3 + add r5, r5, lr + ldr lr, [sp, #(42+3)*4] + str r5, [sp, #(44+15)*4] + mov r12, r4, ror #17 + add r11, r11, r6 + eor r12, r12, r4, ror #19 + mov r6, lr, ror #7 + eor r12, r12, r4, lsr #10 + eor r6, r6, lr, ror #18 + add r12, r12, r11 + ldr r11, [sp, #(44+2)*4] + eor r6, r6, lr, lsr #3 + add r6, r6, r12 -.macro sha256_4way_extend_doubleround_core i, rr, rw, ra, rb, ry, rz - vadd.u32 q5, q5, \ra - veor.u32 q4, q4, q0 - vshr.u32 q0, \ry, #19 - vshl.u32 q1, \ry, #32-19 - veor.u32 q4, q4, q0 - vshr.u32 \ra, q6, #7 - vshl.u32 q0, q6, #32-7 - veor.u32 q4, q4, q1 - veor.u32 \ra, \ra, q0 - vshr.u32 q1, \ry, #10 - vshr.u32 q0, q6, #18 - veor.u32 q4, q4, q1 - veor.u32 \ra, \ra, q0 - vshl.u32 q1, q6, #32-18 - vshr.u32 q0, q6, #3 - veor.u32 \ra, \ra, q1 - vadd.u32 q4, q4, q5 - veor.u32 \ra, \ra, q0 - vld1.u32 {q5}, [\rr]! - vadd.u32 \ra, \ra, q4 + mov r12, r5, ror #17 + str r6, [sp, #(44+16)*4] + add lr, lr, r7 + eor r12, r12, r5, ror #19 + mov r7, r11, ror #7 + eor r12, r12, r5, lsr #10 + eor r7, r7, r11, ror #18 + add lr, lr, r12 + eor r7, r7, r11, lsr #3 + add r7, r7, lr + ldr lr, [sp, #(44+3)*4] + str r7, [sp, #(46+15)*4] + mov r12, r6, ror #17 + add r11, r11, r8 + eor r12, r12, r6, ror #19 + mov r8, lr, ror #7 + eor r12, r12, r6, lsr #10 + eor r8, r8, lr, ror #18 + add r12, r12, r11 + ldr r11, [sp, #(46+2)*4] + eor r8, r8, lr, lsr #3 + add r8, r8, r12 - vshr.u32 q4, \rz, #17 - vshl.u32 q0, \rz, #32-17 - vadd.u32 q6, q6, \rb - vst1.u32 {\ra}, [\rw]! 
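(Orientation note, not part of the patch: in the 4-way NEON path above, each q register holds the same word of four independent SHA-256 lanes, which is why every constant in sha256_4h and sha256_4k is replicated four times. sha256_init_4way simply broadcasts the standard initial hash values; a rough C equivalent of that layout:)

#include <stdint.h>

/* 4-way state layout: word i of lane j lives at state[i * 4 + j]. */
static void sha256_init_4way_ref(uint32_t state[8 * 4])
{
	static const uint32_t h[8] = {
		0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
		0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
	};
	int i, j;

	for (i = 0; i < 8; i++)
		for (j = 0; j < 4; j++)
			state[i * 4 + j] = h[i];
}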
- veor.u32 q4, q4, q0 - vshr.u32 q0, \rz, #19 - vshl.u32 q1, \rz, #32-19 - veor.u32 q4, q4, q0 - vshr.u32 \rb, q5, #7 - veor.u32 q4, q4, q1 - vshl.u32 q0, q5, #32-7 - vshr.u32 q1, \rz, #10 - veor.u32 \rb, \rb, q0 - vshr.u32 q0, q5, #18 - veor.u32 q4, q4, q1 - veor.u32 \rb, \rb, q0 - vshl.u32 q1, q5, #32-18 - vshr.u32 q0, q5, #3 - veor.u32 \rb, \rb, q1 - vadd.u32 q1, q6, q4 - veor.u32 \rb, \rb, q0 -.endm + mov r12, r7, ror #17 + str r8, [sp, #(46+16)*4] + add lr, lr, r9 + eor r12, r12, r7, ror #19 + mov r9, r11, ror #7 + eor r12, r12, r7, lsr #10 + eor r9, r9, r11, ror #18 + add lr, lr, r12 + eor r9, r9, r11, lsr #3 + add r9, r9, lr + str r9, [sp, #(46+17)*4] + + ldmia r0, {r4-r11} + ldr r12, [sp, #(0+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (0+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 -.macro sha256_4way_extend_doubleround_head i, rr, rw, ra, rb, ry, rz - vld1.u32 {q6}, [\rr]! - vshr.u32 q4, \ry, #17 - vshl.u32 q0, \ry, #32-17 - sha256_4way_extend_doubleround_core \i, \rr, \rw, \ra, \rb, \ry, \rz - vld1.u32 {q6}, [\rr]! - vadd.u32 \rb, \rb, q1 -.endm + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [sp, #(0+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (0+1)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 -.macro sha256_4way_extend_doubleround_body i, rr, rw, ra, rb, ry, rz - vshr.u32 q4, \ry, #17 - vshl.u32 q0, \ry, #32-17 - vst1.u32 {\rz}, [\rw]! - sha256_4way_extend_doubleround_core \i, \rr, \rw, \ra, \rb, \ry, \rz - vld1.u32 {q6}, [\rr]! - vadd.u32 \rb, \rb, q1 -.endm + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [sp, #(0+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (0+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 -.macro sha256_4way_extend_doubleround_foot i, rr, rw, ra, rb, ry, rz - vshr.u32 q4, \ry, #17 - vshl.u32 q0, \ry, #32-17 - vst1.u32 {\rz}, [\rw]! - sha256_4way_extend_doubleround_core \i, \rr, \rw, \ra, \rb, \ry, \rz - vadd.u32 \rb, \rb, q1 - vst1.u32 {\rb}, [\rw]! -.endm + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [sp, #(0+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (0+3)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 -.macro sha256_4way_main_round i, rk, rw, ra, rb, rc, rd, re, rf, rg, rh - vld1.u32 {q8}, [\rw]! - vand.u32 q9, \rf, \re - vbic.u32 q10, \rg, \re - vshr.u32 q11, \re, #5 - vorr.u32 q10, q10, q9 - vld1.u32 {q9}, [\rk]! 
- vadd.u32 \rh, \rh, q10 - vshl.u32 q12, \re, #32-5 - veor.u32 q10, \re, q11 - vshr.u32 q11, \re, #19 - veor.u32 q10, q10, q12 - vshl.u32 q12, \re, #32-19 - veor.u32 q10, q10, q11 - vadd.u32 \rh, \rh, q8 - veor.u32 q10, q10, q12 - vadd.u32 \rh, \rh, q9 - veor.u32 q9, \ra, \rb - vshr.u32 q11, q10, #6 - vshl.u32 q13, q10, #32-6 - vadd.u32 \rh, \rh, q11 + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + ldr r12, [sp, #(4+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (4+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 - vshr.u32 q11, \ra, #11 - vshl.u32 q12, \ra, #32-11 - veor.u32 q8, \ra, q11 - vand.u32 q10, \ra, \rb - veor.u32 q8, q8, q12 - vshr.u32 q11, \ra, #20 - vshl.u32 q12, \ra, #32-20 - veor.u32 q8, q8, q11 - vand.u32 q9, q9, \rc - veor.u32 q8, q8, q12 - vadd.u32 \rh, \rh, q13 - veor.u32 q10, q10, q9 - vshr.u32 q11, q8, #2 - vshl.u32 q12, q8, #32-2 - vadd.u32 q9, \rh, q10 - vadd.u32 q12, q12, q11 - vadd.u32 \rh, \rh, \rd - vadd.u32 \rd, q9, q12 -.endm + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [sp, #(4+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (4+1)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 -.macro sha256_4way_main_quadround i, rk, rw - sha256_4way_main_round \i+0, \rk, \rw, q0, q1, q2, q3, q4, q5, q6, q7 - sha256_4way_main_round \i+1, \rk, \rw, q3, q0, q1, q2, q7, q4, q5, q6 - sha256_4way_main_round \i+2, \rk, \rw, q2, q3, q0, q1, q6, q7, q4, q5 - sha256_4way_main_round \i+3, \rk, \rw, q1, q2, q3, q0, q5, q6, q7, q4 -.endm + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [sp, #(4+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (4+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [sp, #(4+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (4+3)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 - .text - .code 32 - .align 2 - .globl sha256_transform_4way - .globl _sha256_transform_4way -#ifdef __ELF__ - .type sha256_transform_4way, %function -#endif -sha256_transform_4way: -_sha256_transform_4way: - stmfd sp!, {r4, lr} - vpush {q4-q7} - mov r12, sp - sub sp, sp, #64*16 - bic sp, sp, #63 - cmp r2, #0 - bne sha256_transform_4way_swap - - vldmia r1!, {q0-q7} - vstmia sp, {q0-q7} - add r3, sp, #8*16 - vldmia r1, {q8-q15} - vstmia r3, {q8-q15} - b sha256_transform_4way_extend + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + ldr r12, [sp, 
#(8+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (8+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 -sha256_transform_4way_swap: - vldmia r1!, {q0-q7} - vrev32.8 q0, q0 - vrev32.8 q1, q1 - vrev32.8 q2, q2 - vrev32.8 q3, q3 - vldmia r1, {q8-q15} - vrev32.8 q4, q4 - vrev32.8 q5, q5 - vrev32.8 q6, q6 - vrev32.8 q7, q7 - vstmia sp, {q0-q7} - vrev32.8 q8, q8 - vrev32.8 q9, q9 - vrev32.8 q10, q10 - vrev32.8 q11, q11 - vrev32.8 q12, q12 - vrev32.8 q13, q13 - vrev32.8 q14, q14 - vrev32.8 q15, q15 - add r3, sp, #8*16 - vstmia r3, {q8-q15} - -sha256_transform_4way_extend: - add r1, sp, #1*16 - add r2, sp, #16*16 - vmov.u32 q5, q0 - sha256_4way_extend_doubleround_head 0, r1, r2, q9, q10, q14, q15 - sha256_4way_extend_doubleround_body 2, r1, r2, q11, q12, q9, q10 - sha256_4way_extend_doubleround_body 4, r1, r2, q13, q14, q11, q12 - sha256_4way_extend_doubleround_body 6, r1, r2, q15, q9, q13, q14 - sha256_4way_extend_doubleround_body 8, r1, r2, q10, q11, q15, q9 - sha256_4way_extend_doubleround_body 10, r1, r2, q12, q13, q10, q11 - sha256_4way_extend_doubleround_body 12, r1, r2, q14, q15, q12, q13 - sha256_4way_extend_doubleround_body 14, r1, r2, q9, q10, q14, q15 - sha256_4way_extend_doubleround_body 16, r1, r2, q11, q12, q9, q10 - sha256_4way_extend_doubleround_body 18, r1, r2, q13, q14, q11, q12 - sha256_4way_extend_doubleround_body 20, r1, r2, q15, q9, q13, q14 - sha256_4way_extend_doubleround_body 22, r1, r2, q10, q11, q15, q9 - sha256_4way_extend_doubleround_body 24, r1, r2, q12, q13, q10, q11 - sha256_4way_extend_doubleround_body 26, r1, r2, q14, q15, q12, q13 - sha256_4way_extend_doubleround_body 28, r1, r2, q9, q10, q14, q15 - sha256_4way_extend_doubleround_body 30, r1, r2, q11, q12, q9, q10 - sha256_4way_extend_doubleround_body 32, r1, r2, q13, q14, q11, q12 - sha256_4way_extend_doubleround_body 34, r1, r2, q15, q9, q13, q14 - sha256_4way_extend_doubleround_body 36, r1, r2, q10, q11, q15, q9 - sha256_4way_extend_doubleround_body 38, r1, r2, q12, q13, q10, q11 - sha256_4way_extend_doubleround_body 40, r1, r2, q14, q15, q12, q13 - sha256_4way_extend_doubleround_body 42, r1, r2, q9, q10, q14, q15 - sha256_4way_extend_doubleround_body 44, r1, r2, q11, q12, q9, q10 - sha256_4way_extend_doubleround_foot 46, r1, r2, q13, q14, q11, q12 - - vldmia r0, {q0-q7} - adr r4, sha256_transform_4way_4k - b sha256_transform_4way_4k_over - .align 4 -sha256_transform_4way_4k: - sha256_4k -sha256_transform_4way_4k_over: - sha256_4way_main_quadround 0, r4, sp - sha256_4way_main_quadround 4, r4, sp - sha256_4way_main_quadround 8, r4, sp - sha256_4way_main_quadround 12, r4, sp - sha256_4way_main_quadround 16, r4, sp - sha256_4way_main_quadround 20, r4, sp - sha256_4way_main_quadround 24, r4, sp - sha256_4way_main_quadround 28, r4, sp - sha256_4way_main_quadround 32, r4, sp - sha256_4way_main_quadround 36, r4, sp - sha256_4way_main_quadround 40, r4, sp - sha256_4way_main_quadround 44, r4, sp - sha256_4way_main_quadround 48, r4, sp - sha256_4way_main_quadround 52, r4, sp - sha256_4way_main_quadround 56, r4, sp - sha256_4way_main_quadround 60, r4, sp - - vldmia r0, {q8-q15} - vadd.u32 q0, q0, q8 - vadd.u32 q1, q1, q9 - vadd.u32 q2, q2, q10 + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [sp, #(8+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + 
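(Reference, not part of the patch: each unrolled block above is one standard SHA-256 compression round. The and/bic/orr triple is Ch(e,f,g); the eor chain on e followed by ror #6 is Sigma1; the eor chain on a followed by ror #2 is Sigma0; and instead of shifting the working variables, the code rotates the roles of r4..r11 from round to round. The equivalent round in C:)

#include <stdint.h>

#define ROTR(x, n)   (((x) >> (n)) | ((x) << (32 - (n))))
#define Ch(e, f, g)  (((e) & (f)) | (~(e) & (g)))          /* and/bic/orr above */
#define Maj(a, b, c) (((a) & (b)) ^ (((a) ^ (b)) & (c)))
#define E0(a) (ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22))
#define E1(e) (ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25))

/* One SHA-256 round on working variables s[0..7] = a..h. */
static void sha256_round(uint32_t s[8], uint32_t k, uint32_t w)
{
	uint32_t t1 = s[7] + E1(s[4]) + Ch(s[4], s[5], s[6]) + k + w;
	uint32_t t2 = E0(s[0]) + Maj(s[0], s[1], s[2]);

	s[7] = s[6]; s[6] = s[5]; s[5] = s[4]; s[4] = s[3] + t1;
	s[3] = s[2]; s[2] = s[1]; s[1] = s[0]; s[0] = t1 + t2;
}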
orr lr, lr, r3 + ldr r3, sha256_transform_k + (8+1)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [sp, #(8+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (8+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [sp, #(8+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (8+3)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + ldr r12, [sp, #(12+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (12+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 + + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [sp, #(12+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (12+1)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [sp, #(12+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (12+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [sp, #(12+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (12+3)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + ldr r12, [sp, #(16+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (16+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 + + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [sp, #(16+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, 
sha256_transform_k + (16+1)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [sp, #(16+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (16+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [sp, #(16+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (16+3)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + ldr r12, [sp, #(20+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (20+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 + + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [sp, #(20+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (20+1)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [sp, #(20+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (20+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [sp, #(20+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (20+3)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + ldr r12, [sp, #(24+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (24+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 + + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [sp, #(24+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (24+1)*4 + 
add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [sp, #(24+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (24+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [sp, #(24+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (24+3)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + ldr r12, [sp, #(28+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (28+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 + + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [sp, #(28+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (28+1)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [sp, #(28+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (28+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [sp, #(28+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (28+3)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + b sha256_transform_k_over +sha256_transform_k: + .align 2 + .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 
0x92722c85 + .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +sha256_transform_k_over: + ldr r12, [sp, #(32+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (32+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 + + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [sp, #(32+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (32+1)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [sp, #(32+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (32+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [sp, #(32+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (32+3)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + ldr r12, [sp, #(36+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (36+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 + + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [sp, #(36+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (36+1)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [sp, #(36+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (36+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [sp, #(36+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, 
sha256_transform_k + (36+3)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + ldr r12, [sp, #(40+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (40+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 + + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [sp, #(40+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (40+1)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [sp, #(40+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (40+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [sp, #(40+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (40+3)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + ldr r12, [sp, #(44+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (44+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 + + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [sp, #(44+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (44+1)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [sp, #(44+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (44+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [sp, #(44+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (44+3)*4 + 
add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + ldr r12, [sp, #(48+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (48+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 + + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [sp, #(48+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (48+1)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [sp, #(48+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (48+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [sp, #(48+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (48+3)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + ldr r12, [sp, #(52+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (52+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 + + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [sp, #(52+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (52+1)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [sp, #(52+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (52+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [sp, #(52+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (52+3)*4 + add r8, r8, lr + eor lr, r9, r9, 
ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + ldr r12, [sp, #(56+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (56+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 + + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [sp, #(56+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (56+1)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [sp, #(56+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (56+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [sp, #(56+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (56+3)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + ldr r12, [sp, #(60+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (60+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 + + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [sp, #(60+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (60+1)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [sp, #(60+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (60+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [sp, #(60+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256_transform_k + (60+3)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, 
lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + + ldmia r0, {r1, r2, r3, r12} + add r4, r4, r1 + add r5, r5, r2 + add r6, r6, r3 + add r7, r7, r12 + stmia r0!, {r4-r7} + ldmia r0, {r1, r2, r3, r12} + add r8, r8, r1 + add r9, r9, r2 + add r10, r10, r3 + add r11, r11, r12 + stmia r0, {r8-r11} + + add sp, sp, #64*4 +#ifdef __thumb__ + ldmfd sp!, {r4-r11, lr} + bx lr +#else + ldmfd sp!, {r4-r11, pc} +#endif + + + .text + .code 32 + .align 2 + .globl sha256d_ms + .globl _sha256d_ms +#ifdef __ELF__ + .type sha256d_ms, %function +#endif +sha256d_ms: +_sha256d_ms: + stmfd sp!, {r4-r11, lr} + sub sp, sp, #64*4 + + cmp r0, r0 + + ldr lr, [r1, #3*4] + ldr r6, [r1, #18*4] + ldr r7, [r1, #19*4] + + mov r12, lr, ror #7 + str r6, [sp, #18*4] + eor r12, r12, lr, ror #18 + str r7, [sp, #19*4] + eor r12, r12, lr, lsr #3 + ldr r8, [r1, #20*4] + add r6, r6, r12 + ldr r10, [r1, #22*4] + add r7, r7, lr + str r6, [r1, #18*4] + + mov r12, r6, ror #17 + str r7, [r1, #19*4] + eor r12, r12, r6, ror #19 + str r8, [sp, #20*4] + eor r12, r12, r6, lsr #10 + ldr r4, [r1, #23*4] + add r8, r8, r12 + ldr r5, [r1, #24*4] + + mov r9, r7, ror #17 + str r8, [r1, #20*4] + eor r9, r9, r7, ror #19 + str r10, [sp, #21*4] + eor r9, r9, r7, lsr #10 + str r4, [sp, #22*4] + + mov r12, r8, ror #17 + str r9, [r1, #21*4] + eor r12, r12, r8, ror #19 + str r5, [sp, #23*4] + eor r12, r12, r8, lsr #10 + mov lr, r9, ror #17 + add r10, r10, r12 + ldr r11, [r1, #30*4] + + eor lr, lr, r9, ror #19 + str r10, [r1, #22*4] + eor lr, lr, r9, lsr #10 + str r11, [sp, #24*4] + add r4, r4, lr + + mov r12, r10, ror #17 + str r4, [r1, #23*4] + eor r12, r12, r10, ror #19 + mov lr, r4, ror #17 + eor r12, r12, r10, lsr #10 + eor lr, lr, r4, ror #19 + add r5, r5, r12 + eor lr, lr, r4, lsr #10 + str r5, [r1, #24*4] + add r6, r6, lr + + mov r12, r5, ror #17 + str r6, [r1, #25*4] + eor r12, r12, r5, ror #19 + mov lr, r6, ror #17 + eor r12, r12, r5, lsr #10 + eor lr, lr, r6, ror #19 + add r7, r7, r12 + eor lr, lr, r6, lsr #10 + str r7, [r1, #26*4] + add r8, r8, lr + + mov r12, r7, ror #17 + str r8, [r1, #27*4] + eor r12, r12, r7, ror #19 + mov lr, r8, ror #17 + eor r12, r12, r7, lsr #10 + eor lr, lr, r8, ror #19 + add r9, r9, r12 + eor lr, lr, r8, lsr #10 + str r9, [r1, #28*4] + add r10, r10, lr + + ldr lr, [r1, #31*4] + mov r12, r9, ror #17 + str r10, [r1, #29*4] + eor r12, r12, r9, ror #19 + str lr, [sp, #25*4] + eor r12, r12, r9, lsr #10 + add r11, r11, r12 + add r5, r5, lr + mov r12, r10, ror #17 + add r4, r4, r11 + + ldr r11, [r1, #16*4] + eor r12, r12, r10, ror #19 + str r4, [r1, #30*4] + eor r12, r12, r10, lsr #10 + add r5, r5, r12 + ldr lr, [r1, #17*4] + +sha256d_ms_extend_loop2: + str r5, [r1, #(16+15)*4] + mov r12, r4, ror #17 + add r11, r11, r6 + eor r12, r12, r4, ror #19 + mov r6, lr, ror #7 + eor r12, r12, r4, lsr #10 + eor r6, r6, lr, ror #18 + add r12, r12, r11 + ldr r11, [r1, #(16+2)*4] + eor r6, r6, lr, lsr #3 + add r6, r6, r12 + + mov r12, r5, ror #17 + str r6, [r1, #(16+16)*4] + add lr, lr, r7 + eor r12, r12, r5, ror #19 + mov r7, r11, ror #7 + eor r12, r12, r5, lsr #10 + eor r7, r7, r11, ror #18 + add lr, lr, r12 + eor r7, r7, r11, lsr #3 + add r7, r7, lr + ldr lr, [r1, #(16+3)*4] + str r7, [r1, #(18+15)*4] + mov r12, r6, ror #17 + add r11, r11, r8 + eor r12, r12, r6, ror #19 + mov r8, lr, ror #7 + eor r12, r12, r6, lsr #10 + eor r8, r8, lr, ror #18 + 
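+	@ Schedule extension: W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16],
+	@ with sigma0(x) = ror(x,7)  ^ ror(x,18) ^ (x >> 3)
+	@ and  sigma1(x) = ror(x,17) ^ ror(x,19) ^ (x >> 10).
+	@ sha256d_ms specialises this around the words that stay constant while only
+	@ the nonce changes, which is why some contributions appear later as
+	@ precomputed immediates rather than full recurrence steps.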
add r12, r12, r11 + ldr r11, [r1, #(18+2)*4] + eor r8, r8, lr, lsr #3 + add r8, r8, r12 + + mov r12, r7, ror #17 + str r8, [r1, #(18+16)*4] + add lr, lr, r9 + eor r12, r12, r7, ror #19 + mov r9, r11, ror #7 + eor r12, r12, r7, lsr #10 + eor r9, r9, r11, ror #18 + add lr, lr, r12 + eor r9, r9, r11, lsr #3 + add r9, r9, lr + ldr lr, [r1, #(18+3)*4] + str r9, [r1, #(20+15)*4] + mov r12, r8, ror #17 + add r11, r11, r10 + eor r12, r12, r8, ror #19 + mov r10, lr, ror #7 + eor r12, r12, r8, lsr #10 + eor r10, r10, lr, ror #18 + add r12, r12, r11 + ldr r11, [r1, #(20+2)*4] + eor r10, r10, lr, lsr #3 + add r10, r10, r12 + + mov r12, r9, ror #17 + str r10, [r1, #(20+16)*4] + add lr, lr, r4 + eor r12, r12, r9, ror #19 + mov r4, r11, ror #7 + eor r12, r12, r9, lsr #10 + eor r4, r4, r11, ror #18 + add lr, lr, r12 + eor r4, r4, r11, lsr #3 + add r4, r4, lr + ldr lr, [r1, #(20+3)*4] + str r4, [r1, #(22+15)*4] + mov r12, r10, ror #17 + add r11, r11, r5 + eor r12, r12, r10, ror #19 + mov r5, lr, ror #7 + eor r12, r12, r10, lsr #10 + eor r5, r5, lr, ror #18 + add r12, r12, r11 + ldr r11, [r1, #(22+2)*4] + eor r5, r5, lr, lsr #3 + add r5, r5, r12 + + mov r12, r4, ror #17 + str r5, [r1, #(22+16)*4] + add lr, lr, r6 + eor r12, r12, r4, ror #19 + mov r6, r11, ror #7 + eor r12, r12, r4, lsr #10 + eor r6, r6, r11, ror #18 + add lr, lr, r12 + eor r6, r6, r11, lsr #3 + add r6, r6, lr + ldr lr, [r1, #(22+3)*4] + str r6, [r1, #(24+15)*4] + mov r12, r5, ror #17 + add r11, r11, r7 + eor r12, r12, r5, ror #19 + mov r7, lr, ror #7 + eor r12, r12, r5, lsr #10 + eor r7, r7, lr, ror #18 + add r12, r12, r11 + ldr r11, [r1, #(24+2)*4] + eor r7, r7, lr, lsr #3 + add r7, r7, r12 + + mov r12, r6, ror #17 + str r7, [r1, #(24+16)*4] + add lr, lr, r8 + eor r12, r12, r6, ror #19 + mov r8, r11, ror #7 + eor r12, r12, r6, lsr #10 + eor r8, r8, r11, ror #18 + add lr, lr, r12 + eor r8, r8, r11, lsr #3 + add r8, r8, lr + ldr lr, [r1, #(24+3)*4] + str r8, [r1, #(26+15)*4] + mov r12, r7, ror #17 + add r11, r11, r9 + eor r12, r12, r7, ror #19 + mov r9, lr, ror #7 + eor r12, r12, r7, lsr #10 + eor r9, r9, lr, ror #18 + add r12, r12, r11 + ldr r11, [r1, #(26+2)*4] + eor r9, r9, lr, lsr #3 + add r9, r9, r12 + + mov r12, r8, ror #17 + str r9, [r1, #(26+16)*4] + add lr, lr, r10 + eor r12, r12, r8, ror #19 + mov r10, r11, ror #7 + eor r12, r12, r8, lsr #10 + eor r10, r10, r11, ror #18 + add lr, lr, r12 + eor r10, r10, r11, lsr #3 + add r10, r10, lr + ldr lr, [r1, #(26+3)*4] + str r10, [r1, #(28+15)*4] + mov r12, r9, ror #17 + add r11, r11, r4 + eor r12, r12, r9, ror #19 + mov r4, lr, ror #7 + eor r12, r12, r9, lsr #10 + eor r4, r4, lr, ror #18 + add r12, r12, r11 + ldr r11, [r1, #(28+2)*4] + eor r4, r4, lr, lsr #3 + add r4, r4, r12 + + mov r12, r10, ror #17 + str r4, [r1, #(28+16)*4] + add lr, lr, r5 + eor r12, r12, r10, ror #19 + mov r5, r11, ror #7 + eor r12, r12, r10, lsr #10 + eor r5, r5, r11, ror #18 + add lr, lr, r12 + eor r5, r5, r11, lsr #3 + add r5, r5, lr + ldr lr, [r1, #(28+3)*4] + str r5, [r1, #(30+15)*4] + mov r12, r4, ror #17 + add r11, r11, r6 + eor r12, r12, r4, ror #19 + mov r6, lr, ror #7 + eor r12, r12, r4, lsr #10 + eor r6, r6, lr, ror #18 + add r12, r12, r11 + ldr r11, [r1, #(30+2)*4] + eor r6, r6, lr, lsr #3 + add r6, r6, r12 + + mov r12, r5, ror #17 + str r6, [r1, #(30+16)*4] + add lr, lr, r7 + eor r12, r12, r5, ror #19 + mov r7, r11, ror #7 + eor r12, r12, r5, lsr #10 + eor r7, r7, r11, ror #18 + add lr, lr, r12 + eor r7, r7, r11, lsr #3 + add r7, r7, lr + ldr lr, [r1, #(30+3)*4] + str r7, [r1, #(32+15)*4] + mov r12, r6, 
ror #17 + add r11, r11, r8 + eor r12, r12, r6, ror #19 + mov r8, lr, ror #7 + eor r12, r12, r6, lsr #10 + eor r8, r8, lr, ror #18 + add r12, r12, r11 + ldr r11, [r1, #(32+2)*4] + eor r8, r8, lr, lsr #3 + add r8, r8, r12 + + mov r12, r7, ror #17 + str r8, [r1, #(32+16)*4] + add lr, lr, r9 + eor r12, r12, r7, ror #19 + mov r9, r11, ror #7 + eor r12, r12, r7, lsr #10 + eor r9, r9, r11, ror #18 + add lr, lr, r12 + eor r9, r9, r11, lsr #3 + add r9, r9, lr + ldr lr, [r1, #(32+3)*4] + str r9, [r1, #(34+15)*4] + mov r12, r8, ror #17 + add r11, r11, r10 + eor r12, r12, r8, ror #19 + mov r10, lr, ror #7 + eor r12, r12, r8, lsr #10 + eor r10, r10, lr, ror #18 + add r12, r12, r11 + ldr r11, [r1, #(34+2)*4] + eor r10, r10, lr, lsr #3 + add r10, r10, r12 + + mov r12, r9, ror #17 + str r10, [r1, #(34+16)*4] + add lr, lr, r4 + eor r12, r12, r9, ror #19 + mov r4, r11, ror #7 + eor r12, r12, r9, lsr #10 + eor r4, r4, r11, ror #18 + add lr, lr, r12 + eor r4, r4, r11, lsr #3 + add r4, r4, lr + ldr lr, [r1, #(34+3)*4] + str r4, [r1, #(36+15)*4] + mov r12, r10, ror #17 + add r11, r11, r5 + eor r12, r12, r10, ror #19 + mov r5, lr, ror #7 + eor r12, r12, r10, lsr #10 + eor r5, r5, lr, ror #18 + add r12, r12, r11 + ldr r11, [r1, #(36+2)*4] + eor r5, r5, lr, lsr #3 + add r5, r5, r12 + + mov r12, r4, ror #17 + str r5, [r1, #(36+16)*4] + add lr, lr, r6 + eor r12, r12, r4, ror #19 + mov r6, r11, ror #7 + eor r12, r12, r4, lsr #10 + eor r6, r6, r11, ror #18 + add lr, lr, r12 + eor r6, r6, r11, lsr #3 + add r6, r6, lr + ldr lr, [r1, #(36+3)*4] + str r6, [r1, #(38+15)*4] + mov r12, r5, ror #17 + add r11, r11, r7 + eor r12, r12, r5, ror #19 + mov r7, lr, ror #7 + eor r12, r12, r5, lsr #10 + eor r7, r7, lr, ror #18 + add r12, r12, r11 + ldr r11, [r1, #(38+2)*4] + eor r7, r7, lr, lsr #3 + add r7, r7, r12 + + mov r12, r6, ror #17 + str r7, [r1, #(38+16)*4] + add lr, lr, r8 + eor r12, r12, r6, ror #19 + mov r8, r11, ror #7 + eor r12, r12, r6, lsr #10 + eor r8, r8, r11, ror #18 + add lr, lr, r12 + eor r8, r8, r11, lsr #3 + add r8, r8, lr + ldr lr, [r1, #(38+3)*4] + str r8, [r1, #(40+15)*4] + mov r12, r7, ror #17 + add r11, r11, r9 + eor r12, r12, r7, ror #19 + mov r9, lr, ror #7 + eor r12, r12, r7, lsr #10 + eor r9, r9, lr, ror #18 + add r12, r12, r11 + ldr r11, [r1, #(40+2)*4] + eor r9, r9, lr, lsr #3 + add r9, r9, r12 + + mov r12, r8, ror #17 + str r9, [r1, #(40+16)*4] + add lr, lr, r10 + eor r12, r12, r8, ror #19 + mov r10, r11, ror #7 + eor r12, r12, r8, lsr #10 + eor r10, r10, r11, ror #18 + add lr, lr, r12 + eor r10, r10, r11, lsr #3 + add r10, r10, lr + ldr lr, [r1, #(40+3)*4] + str r10, [r1, #(42+15)*4] + mov r12, r9, ror #17 + add r11, r11, r4 + eor r12, r12, r9, ror #19 + mov r4, lr, ror #7 + eor r12, r12, r9, lsr #10 + eor r4, r4, lr, ror #18 + add r12, r12, r11 + ldr r11, [r1, #(42+2)*4] + eor r4, r4, lr, lsr #3 + add r4, r4, r12 + + mov r12, r10, ror #17 + str r4, [r1, #(42+16)*4] + add lr, lr, r5 + eor r12, r12, r10, ror #19 + mov r5, r11, ror #7 + eor r12, r12, r10, lsr #10 + eor r5, r5, r11, ror #18 + add lr, lr, r12 + eor r5, r5, r11, lsr #3 + add r5, r5, lr + ldr lr, [r1, #(42+3)*4] + bne sha256d_ms_extend_coda2 + str r5, [r1, #(44+15)*4] + mov r12, r4, ror #17 + add r11, r11, r6 + eor r12, r12, r4, ror #19 + mov r6, lr, ror #7 + eor r12, r12, r4, lsr #10 + eor r6, r6, lr, ror #18 + add r12, r12, r11 + ldr r11, [r1, #(44+2)*4] + eor r6, r6, lr, lsr #3 + add r6, r6, r12 + + mov r12, r5, ror #17 + str r6, [r1, #(44+16)*4] + add lr, lr, r7 + eor r12, r12, r5, ror #19 + mov r7, r11, ror #7 + eor r12, r12, r5, lsr 
#10 + eor r7, r7, r11, ror #18 + add lr, lr, r12 + eor r7, r7, r11, lsr #3 + add r7, r7, lr + ldr lr, [r1, #(44+3)*4] + str r7, [r1, #(46+15)*4] + mov r12, r6, ror #17 + add r11, r11, r8 + eor r12, r12, r6, ror #19 + mov r8, lr, ror #7 + eor r12, r12, r6, lsr #10 + eor r8, r8, lr, ror #18 + add r12, r12, r11 + ldr r11, [r1, #(46+2)*4] + eor r8, r8, lr, lsr #3 + add r8, r8, r12 + + mov r12, r7, ror #17 + str r8, [r1, #(46+16)*4] + add lr, lr, r9 + eor r12, r12, r7, ror #19 + mov r9, r11, ror #7 + eor r12, r12, r7, lsr #10 + eor r9, r9, r11, ror #18 + add lr, lr, r12 + eor r9, r9, r11, lsr #3 + add r9, r9, lr + str r9, [r1, #(46+17)*4] + + ldr r4, [r3, #0*4] + ldr r9, [r3, #1*4] + ldr r10, [r3, #2*4] + ldr r11, [r3, #3*4] + ldr r8, [r3, #4*4] + ldr r5, [r3, #5*4] + ldr r6, [r3, #6*4] + ldr r7, [r3, #7*4] + b sha256d_ms_main_loop1 + +sha256d_ms_main_loop2: + ldr r12, [r1, #(0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 + + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [r1, #(1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (1)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [r1, #(2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 +sha256d_ms_main_loop1: + ldr r12, [r1, #(3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (3)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + ldr r12, [r1, #(4+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (4+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 + + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [r1, #(4+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (4+1)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [r1, #(4+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, 
r3 + ldr r3, sha256d_ms_k + (4+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [r1, #(4+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (4+3)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + ldr r12, [r1, #(8+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (8+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 + + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [r1, #(8+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (8+1)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [r1, #(8+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (8+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [r1, #(8+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (8+3)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + ldr r12, [r1, #(12+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (12+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 + + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [r1, #(12+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (12+1)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [r1, #(12+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (12+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, 
r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [r1, #(12+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (12+3)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + ldr r12, [r1, #(16+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (16+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 + + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [r1, #(16+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (16+1)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [r1, #(16+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (16+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [r1, #(16+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (16+3)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + ldr r12, [r1, #(20+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (20+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 + + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [r1, #(20+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (20+1)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [r1, #(20+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (20+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, 
lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [r1, #(20+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (20+3)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + ldr r12, [r1, #(24+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (24+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 + + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [r1, #(24+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (24+1)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [r1, #(24+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (24+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [r1, #(24+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (24+3)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + ldr r12, [r1, #(28+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (28+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 + + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [r1, #(28+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (28+1)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [r1, #(28+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (28+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor 
r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [r1, #(28+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (28+3)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + b sha256d_ms_k_over +sha256d_ms_k: + .align 2 + .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +sha256d_ms_k_over: + ldr r12, [r1, #(32+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (32+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 + + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [r1, #(32+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (32+1)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [r1, #(32+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (32+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [r1, #(32+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (32+3)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + ldr r12, [r1, #(36+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (36+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 + + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, 
r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [r1, #(36+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (36+1)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [r1, #(36+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (36+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [r1, #(36+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (36+3)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + ldr r12, [r1, #(40+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (40+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 + + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [r1, #(40+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (40+1)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [r1, #(40+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (40+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [r1, #(40+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (40+3)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + ldr r12, [r1, #(44+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (44+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 + + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, 
r12, ror #2 + ldr r12, [r1, #(44+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (44+1)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [r1, #(44+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (44+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [r1, #(44+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (44+3)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + ldr r12, [r1, #(48+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (48+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 + + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [r1, #(48+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (48+1)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [r1, #(48+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (48+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [r1, #(48+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (48+3)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + ldr r12, [r1, #(52+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (52+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 + + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [r1, #(52+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr 
lr, lr, r3 + ldr r3, sha256d_ms_k + (52+1)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [r1, #(52+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (52+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [r1, #(52+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (52+3)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + ldr r12, [r1, #(56)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (56)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 + + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + bne sha256d_ms_finish + ldr r12, [r1, #(57)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (57)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [r1, #(58)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (58)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [r1, #(59)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (59)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + ldr r12, [r1, #(60+0)*4] + and r3, r9, r8 + bic lr, r10, r8 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (60+0)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + eor r3, r4, r5 + add r11, r11, lr, ror #6 + + and r3, r3, r6 + eor r12, r4, r4, ror #11 + and lr, r4, r5 + eor r12, r12, r4, ror #20 + eor lr, lr, r3 + add r3, r11, lr + add r11, r11, r7 + add r7, r3, r12, ror #2 + ldr r12, [r1, #(60+1)*4] + and r3, r8, r11 + bic lr, r9, r11 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (60+1)*4 + add r10, r10, lr + eor lr, 
r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + eor r3, r7, r4 + add r10, r10, lr, ror #6 + + and r3, r3, r5 + eor r12, r7, r7, ror #11 + and lr, r7, r4 + eor r12, r12, r7, ror #20 + eor lr, lr, r3 + add r3, r10, lr + add r10, r10, r6 + add r6, r3, r12, ror #2 + ldr r12, [r1, #(60+2)*4] + and r3, r11, r10 + bic lr, r8, r10 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (60+2)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + eor r3, r6, r7 + add r9, r9, lr, ror #6 + + and r3, r3, r4 + eor r12, r6, r6, ror #11 + and lr, r6, r7 + eor r12, r12, r6, ror #20 + eor lr, lr, r3 + add r3, r9, lr + add r9, r9, r5 + add r5, r3, r12, ror #2 + ldr r12, [r1, #(60+3)*4] + and r3, r10, r9 + bic lr, r11, r9 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (60+3)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + eor r3, r5, r6 + add r8, r8, lr, ror #6 + + and r3, r3, r7 + eor r12, r5, r5, ror #11 + and lr, r5, r6 + eor r12, r12, r5, ror #20 + eor lr, lr, r3 + add r3, r8, lr + add r8, r8, r4 + add r4, r3, r12, ror #2 + + ldmia r2!, {r3, r12, lr} + add r4, r4, r3 + add r5, r5, r12 + add r6, r6, lr + stmia sp, {r4-r6} + ldmia r2, {r3, r4, r5, r6, r12} + add lr, sp, #3*4 + add r7, r7, r3 + add r8, r8, r4 + add r9, r9, r5 + add r10, r10, r6 + add r11, r11, r12 + add r12, sp, #18*4 + stmia lr!, {r7-r11} + + ldmia r12, {r4-r11} + str r4, [r1, #18*4] + str r5, [r1, #19*4] + str r6, [r1, #20*4] + str r7, [r1, #22*4] + str r8, [r1, #23*4] + str r9, [r1, #24*4] + str r10, [r1, #30*4] + str r11, [r1, #31*4] + + mov r3, #0x80000000 + mov r4, #0 + mov r5, #0 + mov r6, #0 + mov r7, #0 + mov r8, #0 + mov r9, #0 + mov r10, #0x00000100 + stmia lr, {r3-r10} + + ldr lr, [sp, #1*4] + movs r1, sp + ldr r4, [sp, #0*4] + + ldr r11, [sp, #2*4] + mov r12, lr, ror #7 + eor r12, r12, lr, ror #18 + add r5, lr, #0x00a00000 + eor r12, r12, lr, lsr #3 + mov lr, r11, ror #7 + add r4, r4, r12 + eor lr, lr, r11, ror #18 + str r4, [sp, #16*4] + eor lr, lr, r11, lsr #3 + mov r12, r4, ror #17 + add r5, r5, lr + ldr lr, [sp, #3*4] + + str r5, [sp, #17*4] + eor r12, r12, r4, ror #19 + mov r6, lr, ror #7 + eor r12, r12, r4, lsr #10 + eor r6, r6, lr, ror #18 + add r11, r11, r12 + eor r6, r6, lr, lsr #3 + mov r12, r5, ror #17 + add r6, r6, r11 + ldr r11, [sp, #4*4] + + str r6, [sp, #18*4] + eor r12, r12, r5, ror #19 + mov r7, r11, ror #7 + eor r12, r12, r5, lsr #10 + eor r7, r7, r11, ror #18 + add lr, lr, r12 + eor r7, r7, r11, lsr #3 + mov r12, r6, ror #17 + add r7, r7, lr + ldr lr, [sp, #5*4] + + str r7, [sp, #19*4] + eor r12, r12, r6, ror #19 + mov r8, lr, ror #7 + eor r12, r12, r6, lsr #10 + eor r8, r8, lr, ror #18 + add r11, r11, r12 + eor r8, r8, lr, lsr #3 + mov r12, r7, ror #17 + add r8, r8, r11 + ldr r11, [sp, #6*4] + + str r8, [sp, #20*4] + eor r12, r12, r7, ror #19 + mov r9, r11, ror #7 + eor r12, r12, r7, lsr #10 + eor r9, r9, r11, ror #18 + add lr, lr, r12 + eor r9, r9, r11, lsr #3 + mov r12, r8, ror #17 + add r9, r9, lr + ldr lr, [sp, #7*4] + + str r9, [sp, #21*4] + eor r12, r12, r8, ror #19 + mov r10, lr, ror #7 + eor r12, r12, r8, lsr #10 + eor r10, r10, lr, ror #18 + add r11, r11, r12 + eor r10, r10, lr, lsr #3 + mov r12, r9, ror #17 + add r11, r11, #0x00000100 + add lr, lr, r4 + add r10, r10, r11 + + eor r12, r12, r9, ror #19 + str r10, [sp, #22*4] + add lr, lr, #0x11000000 + eor r12, r12, r9, lsr #10 + add lr, lr, r12 + mov r12, r10, ror #17 + add r4, lr, #0x00002000 + eor r12, r12, r10, ror 
#19 + str r4, [sp, #23*4] + add r5, r5, #0x80000000 + eor r12, r12, r10, lsr #10 + add r5, r5, r12 + + mov r12, r4, ror #17 + str r5, [sp, #24*4] + eor r12, r12, r4, ror #19 + mov r11, r5, ror #17 + eor r12, r12, r4, lsr #10 + eor r11, r11, r5, ror #19 + add r6, r6, r12 + eor r11, r11, r5, lsr #10 + str r6, [sp, #25*4] + add r7, r7, r11 + + mov r12, r6, ror #17 + str r7, [sp, #26*4] + eor r12, r12, r6, ror #19 + mov r11, r7, ror #17 + eor r12, r12, r6, lsr #10 + eor r11, r11, r7, ror #19 + add r8, r8, r12 + eor r11, r11, r7, lsr #10 + str r8, [sp, #27*4] + add r9, r9, r11 + + mov lr, r8, ror #17 + mov r12, r9, ror #17 + str r9, [sp, #28*4] + add r4, r4, #0x00400000 + eor lr, lr, r8, ror #19 + eor r12, r12, r9, ror #19 + eor lr, lr, r8, lsr #10 + eor r12, r12, r9, lsr #10 + add r4, r4, #0x00000022 + add r10, r10, lr + add r4, r4, r12 + ldr r11, [sp, #16*4] + + add r5, r5, #0x00000100 + str r4, [sp, #30*4] + mov lr, r11, ror #7 + str r10, [sp, #29*4] + mov r12, r10, ror #17 + eor lr, lr, r11, ror #18 + eor r12, r12, r10, ror #19 + eor lr, lr, r11, lsr #3 + eor r12, r12, r10, lsr #10 + add r5, r5, lr + ldr lr, [r1, #17*4] + add r5, r5, r12 + + b sha256d_ms_extend_loop2 + +sha256d_ms_extend_coda2: + str r5, [r1, #(44+15)*4] + mov r12, r4, ror #17 + add r11, r11, r6 + mov r6, lr, ror #7 + eor r12, r12, r4, ror #19 + eor r6, r6, lr, ror #18 + eor r12, r12, r4, lsr #10 + eor r6, r6, lr, lsr #3 + add r12, r12, r11 + add r6, r6, r12 + str r6, [r1, #(44+16)*4] + + adr r2, sha256d_ms_h + ldmia r2, {r4-r11} + b sha256d_ms_main_loop2 + +sha256d_ms_h: + .long 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a + .long 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 + + +sha256d_ms_finish: + ldr r12, [r1, #(57)*4] + and r3, r8, r11 + bic lr, r9, r11 + add r10, r10, r6 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (57)*4 + add r10, r10, lr + eor lr, r11, r11, ror #5 + add r10, r10, r12 + eor lr, lr, r11, ror #19 + add r10, r10, r3 + add r10, r10, lr, ror #6 + ldr r12, [r1, #(58)*4] + and r3, r11, r10 + bic lr, r8, r10 + add r9, r9, r5 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (58)*4 + add r9, r9, lr + eor lr, r10, r10, ror #5 + add r9, r9, r12 + eor lr, lr, r10, ror #19 + add r9, r9, r3 + add r9, r9, lr, ror #6 + ldr r12, [r1, #(59)*4] + and r3, r10, r9 + bic lr, r11, r9 + add r8, r8, r4 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (59)*4 + add r8, r8, lr + eor lr, r9, r9, ror #5 + add r8, r8, r12 + eor lr, lr, r9, ror #19 + add r8, r8, r3 + add r8, r8, lr, ror #6 + ldr r5, [r2, #7*4] + ldr r12, [r1, #(60)*4] + and r3, r9, r8 + bic lr, r10, r8 + add r11, r11, r7 + orr lr, lr, r3 + ldr r3, sha256d_ms_k + (60)*4 + add r11, r11, lr + eor lr, r8, r8, ror #5 + add r11, r11, r12 + eor lr, lr, r8, ror #19 + add r11, r11, r3 + add r11, r11, lr, ror #6 + + add r11, r11, r5 + str r11, [r0, #7*4] + + add sp, sp, #64*4 +#ifdef __thumb__ + ldmfd sp!, {r4-r11, lr} + bx lr +#else + ldmfd sp!, {r4-r11, pc} +#endif + + +#ifdef __ARM_NEON__ + + .text + .code 32 + .align 2 + .globl sha256_init_4way + .globl _sha256_init_4way +#ifdef __ELF__ + .type sha256_init_4way, %function +#endif +sha256_init_4way: +_sha256_init_4way: + adr r12, sha256_4h + vldmia r12, {q8-q15} + vstmia r0, {q8-q15} + bx lr + .align 4 +sha256_4h: + .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 + .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 + .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372 + .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a + .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f + .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 
0x9b05688c + .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab + .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 + + + + + + + + + + .text + .code 32 + .align 2 + .globl sha256_transform_4way + .globl _sha256_transform_4way +#ifdef __ELF__ + .type sha256_transform_4way, %function +#endif +sha256_transform_4way: +_sha256_transform_4way: + stmfd sp!, {r4, lr} + vpush {q4-q7} + mov r12, sp + sub sp, sp, #64*16 + bic sp, sp, #63 + cmp r2, #0 + bne sha256_transform_4way_swap + + vldmia r1!, {q0-q7} + vstmia sp, {q0-q7} + add r3, sp, #8*16 + vldmia r1, {q8-q15} + vstmia r3, {q8-q15} + b sha256_transform_4way_extend + +sha256_transform_4way_swap: + vldmia r1!, {q0-q7} + vrev32.8 q0, q0 + vrev32.8 q1, q1 + vrev32.8 q2, q2 + vrev32.8 q3, q3 + vldmia r1, {q8-q15} + vrev32.8 q4, q4 + vrev32.8 q5, q5 + vrev32.8 q6, q6 + vrev32.8 q7, q7 + vstmia sp, {q0-q7} + vrev32.8 q8, q8 + vrev32.8 q9, q9 + vrev32.8 q10, q10 + vrev32.8 q11, q11 + vrev32.8 q12, q12 + vrev32.8 q13, q13 + vrev32.8 q14, q14 + vrev32.8 q15, q15 + add r3, sp, #8*16 + vstmia r3, {q8-q15} + +sha256_transform_4way_extend: + add r1, sp, #1*16 + add r2, sp, #16*16 + vmov.u32 q5, q0 + vld1.u32 {q6}, [r1]! + vshr.u32 q4, q14, #17 + vshl.u32 q0, q14, #32-17 + vadd.u32 q5, q5, q9 + veor.u32 q4, q4, q0 + vshr.u32 q0, q14, #19 + vshl.u32 q1, q14, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q9, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q9, q9, q0 + vshr.u32 q1, q14, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q9, q9, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q9, q9, q1 + vadd.u32 q4, q4, q5 + veor.u32 q9, q9, q0 + vld1.u32 {q5}, [r1]! + vadd.u32 q9, q9, q4 + + vshr.u32 q4, q15, #17 + vshl.u32 q0, q15, #32-17 + vadd.u32 q6, q6, q10 + vst1.u32 {q9}, [r2]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q15, #19 + vshl.u32 q1, q15, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q10, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q15, #10 + veor.u32 q10, q10, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q10, q10, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q10, q10, q1 + vadd.u32 q1, q6, q4 + veor.u32 q10, q10, q0 + vld1.u32 {q6}, [r1]! + vadd.u32 q10, q10, q1 + vshr.u32 q4, q9, #17 + vshl.u32 q0, q9, #32-17 + vst1.u32 {q10}, [r2]! + vadd.u32 q5, q5, q11 + veor.u32 q4, q4, q0 + vshr.u32 q0, q9, #19 + vshl.u32 q1, q9, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q11, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q11, q11, q0 + vshr.u32 q1, q9, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q11, q11, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q11, q11, q1 + vadd.u32 q4, q4, q5 + veor.u32 q11, q11, q0 + vld1.u32 {q5}, [r1]! + vadd.u32 q11, q11, q4 + + vshr.u32 q4, q10, #17 + vshl.u32 q0, q10, #32-17 + vadd.u32 q6, q6, q12 + vst1.u32 {q11}, [r2]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q10, #19 + vshl.u32 q1, q10, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q12, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q10, #10 + veor.u32 q12, q12, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q12, q12, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q12, q12, q1 + vadd.u32 q1, q6, q4 + veor.u32 q12, q12, q0 + vld1.u32 {q6}, [r1]! + vadd.u32 q12, q12, q1 + vshr.u32 q4, q11, #17 + vshl.u32 q0, q11, #32-17 + vst1.u32 {q12}, [r2]! 
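+	@ 4-way interleaved transform: each q register holds the same word of four
+	@ independent blocks, one 32-bit lane per block.  The prologue reserves a
+	@ 64*16-byte stack buffer for the full interleaved schedule; W[0..15] are
+	@ copied in (byte-swapped with vrev32.8 when r2 != 0 on entry) and the
+	@ extension appends W[16..] through the post-incremented pointer in r2.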
+ vadd.u32 q5, q5, q13 + veor.u32 q4, q4, q0 + vshr.u32 q0, q11, #19 + vshl.u32 q1, q11, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q13, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q13, q13, q0 + vshr.u32 q1, q11, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q13, q13, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q13, q13, q1 + vadd.u32 q4, q4, q5 + veor.u32 q13, q13, q0 + vld1.u32 {q5}, [r1]! + vadd.u32 q13, q13, q4 + + vshr.u32 q4, q12, #17 + vshl.u32 q0, q12, #32-17 + vadd.u32 q6, q6, q14 + vst1.u32 {q13}, [r2]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q12, #19 + vshl.u32 q1, q12, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q14, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q12, #10 + veor.u32 q14, q14, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q14, q14, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q14, q14, q1 + vadd.u32 q1, q6, q4 + veor.u32 q14, q14, q0 + vld1.u32 {q6}, [r1]! + vadd.u32 q14, q14, q1 + vshr.u32 q4, q13, #17 + vshl.u32 q0, q13, #32-17 + vst1.u32 {q14}, [r2]! + vadd.u32 q5, q5, q15 + veor.u32 q4, q4, q0 + vshr.u32 q0, q13, #19 + vshl.u32 q1, q13, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q15, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q15, q15, q0 + vshr.u32 q1, q13, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q15, q15, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q15, q15, q1 + vadd.u32 q4, q4, q5 + veor.u32 q15, q15, q0 + vld1.u32 {q5}, [r1]! + vadd.u32 q15, q15, q4 + + vshr.u32 q4, q14, #17 + vshl.u32 q0, q14, #32-17 + vadd.u32 q6, q6, q9 + vst1.u32 {q15}, [r2]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q14, #19 + vshl.u32 q1, q14, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q9, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q14, #10 + veor.u32 q9, q9, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q9, q9, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q9, q9, q1 + vadd.u32 q1, q6, q4 + veor.u32 q9, q9, q0 + vld1.u32 {q6}, [r1]! + vadd.u32 q9, q9, q1 + vshr.u32 q4, q15, #17 + vshl.u32 q0, q15, #32-17 + vst1.u32 {q9}, [r2]! + vadd.u32 q5, q5, q10 + veor.u32 q4, q4, q0 + vshr.u32 q0, q15, #19 + vshl.u32 q1, q15, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q10, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q10, q10, q0 + vshr.u32 q1, q15, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q10, q10, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q10, q10, q1 + vadd.u32 q4, q4, q5 + veor.u32 q10, q10, q0 + vld1.u32 {q5}, [r1]! + vadd.u32 q10, q10, q4 + + vshr.u32 q4, q9, #17 + vshl.u32 q0, q9, #32-17 + vadd.u32 q6, q6, q11 + vst1.u32 {q10}, [r2]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q9, #19 + vshl.u32 q1, q9, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q11, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q9, #10 + veor.u32 q11, q11, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q11, q11, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q11, q11, q1 + vadd.u32 q1, q6, q4 + veor.u32 q11, q11, q0 + vld1.u32 {q6}, [r1]! + vadd.u32 q11, q11, q1 + vshr.u32 q4, q10, #17 + vshl.u32 q0, q10, #32-17 + vst1.u32 {q11}, [r2]! 
+ vadd.u32 q5, q5, q12 + veor.u32 q4, q4, q0 + vshr.u32 q0, q10, #19 + vshl.u32 q1, q10, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q12, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q12, q12, q0 + vshr.u32 q1, q10, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q12, q12, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q12, q12, q1 + vadd.u32 q4, q4, q5 + veor.u32 q12, q12, q0 + vld1.u32 {q5}, [r1]! + vadd.u32 q12, q12, q4 + + vshr.u32 q4, q11, #17 + vshl.u32 q0, q11, #32-17 + vadd.u32 q6, q6, q13 + vst1.u32 {q12}, [r2]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q11, #19 + vshl.u32 q1, q11, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q13, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q11, #10 + veor.u32 q13, q13, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q13, q13, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q13, q13, q1 + vadd.u32 q1, q6, q4 + veor.u32 q13, q13, q0 + vld1.u32 {q6}, [r1]! + vadd.u32 q13, q13, q1 + vshr.u32 q4, q12, #17 + vshl.u32 q0, q12, #32-17 + vst1.u32 {q13}, [r2]! + vadd.u32 q5, q5, q14 + veor.u32 q4, q4, q0 + vshr.u32 q0, q12, #19 + vshl.u32 q1, q12, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q14, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q14, q14, q0 + vshr.u32 q1, q12, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q14, q14, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q14, q14, q1 + vadd.u32 q4, q4, q5 + veor.u32 q14, q14, q0 + vld1.u32 {q5}, [r1]! + vadd.u32 q14, q14, q4 + + vshr.u32 q4, q13, #17 + vshl.u32 q0, q13, #32-17 + vadd.u32 q6, q6, q15 + vst1.u32 {q14}, [r2]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q13, #19 + vshl.u32 q1, q13, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q15, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q13, #10 + veor.u32 q15, q15, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q15, q15, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q15, q15, q1 + vadd.u32 q1, q6, q4 + veor.u32 q15, q15, q0 + vld1.u32 {q6}, [r1]! + vadd.u32 q15, q15, q1 + vshr.u32 q4, q14, #17 + vshl.u32 q0, q14, #32-17 + vst1.u32 {q15}, [r2]! + vadd.u32 q5, q5, q9 + veor.u32 q4, q4, q0 + vshr.u32 q0, q14, #19 + vshl.u32 q1, q14, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q9, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q9, q9, q0 + vshr.u32 q1, q14, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q9, q9, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q9, q9, q1 + vadd.u32 q4, q4, q5 + veor.u32 q9, q9, q0 + vld1.u32 {q5}, [r1]! + vadd.u32 q9, q9, q4 + + vshr.u32 q4, q15, #17 + vshl.u32 q0, q15, #32-17 + vadd.u32 q6, q6, q10 + vst1.u32 {q9}, [r2]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q15, #19 + vshl.u32 q1, q15, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q10, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q15, #10 + veor.u32 q10, q10, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q10, q10, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q10, q10, q1 + vadd.u32 q1, q6, q4 + veor.u32 q10, q10, q0 + vld1.u32 {q6}, [r1]! + vadd.u32 q10, q10, q1 + vshr.u32 q4, q9, #17 + vshl.u32 q0, q9, #32-17 + vst1.u32 {q10}, [r2]! 
+ vadd.u32 q5, q5, q11 + veor.u32 q4, q4, q0 + vshr.u32 q0, q9, #19 + vshl.u32 q1, q9, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q11, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q11, q11, q0 + vshr.u32 q1, q9, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q11, q11, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q11, q11, q1 + vadd.u32 q4, q4, q5 + veor.u32 q11, q11, q0 + vld1.u32 {q5}, [r1]! + vadd.u32 q11, q11, q4 + + vshr.u32 q4, q10, #17 + vshl.u32 q0, q10, #32-17 + vadd.u32 q6, q6, q12 + vst1.u32 {q11}, [r2]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q10, #19 + vshl.u32 q1, q10, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q12, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q10, #10 + veor.u32 q12, q12, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q12, q12, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q12, q12, q1 + vadd.u32 q1, q6, q4 + veor.u32 q12, q12, q0 + vld1.u32 {q6}, [r1]! + vadd.u32 q12, q12, q1 + vshr.u32 q4, q11, #17 + vshl.u32 q0, q11, #32-17 + vst1.u32 {q12}, [r2]! + vadd.u32 q5, q5, q13 + veor.u32 q4, q4, q0 + vshr.u32 q0, q11, #19 + vshl.u32 q1, q11, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q13, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q13, q13, q0 + vshr.u32 q1, q11, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q13, q13, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q13, q13, q1 + vadd.u32 q4, q4, q5 + veor.u32 q13, q13, q0 + vld1.u32 {q5}, [r1]! + vadd.u32 q13, q13, q4 + + vshr.u32 q4, q12, #17 + vshl.u32 q0, q12, #32-17 + vadd.u32 q6, q6, q14 + vst1.u32 {q13}, [r2]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q12, #19 + vshl.u32 q1, q12, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q14, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q12, #10 + veor.u32 q14, q14, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q14, q14, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q14, q14, q1 + vadd.u32 q1, q6, q4 + veor.u32 q14, q14, q0 + vld1.u32 {q6}, [r1]! + vadd.u32 q14, q14, q1 + vshr.u32 q4, q13, #17 + vshl.u32 q0, q13, #32-17 + vst1.u32 {q14}, [r2]! + vadd.u32 q5, q5, q15 + veor.u32 q4, q4, q0 + vshr.u32 q0, q13, #19 + vshl.u32 q1, q13, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q15, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q15, q15, q0 + vshr.u32 q1, q13, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q15, q15, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q15, q15, q1 + vadd.u32 q4, q4, q5 + veor.u32 q15, q15, q0 + vld1.u32 {q5}, [r1]! + vadd.u32 q15, q15, q4 + + vshr.u32 q4, q14, #17 + vshl.u32 q0, q14, #32-17 + vadd.u32 q6, q6, q9 + vst1.u32 {q15}, [r2]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q14, #19 + vshl.u32 q1, q14, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q9, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q14, #10 + veor.u32 q9, q9, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q9, q9, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q9, q9, q1 + vadd.u32 q1, q6, q4 + veor.u32 q9, q9, q0 + vld1.u32 {q6}, [r1]! + vadd.u32 q9, q9, q1 + vshr.u32 q4, q15, #17 + vshl.u32 q0, q15, #32-17 + vst1.u32 {q9}, [r2]! 
+ vadd.u32 q5, q5, q10 + veor.u32 q4, q4, q0 + vshr.u32 q0, q15, #19 + vshl.u32 q1, q15, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q10, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q10, q10, q0 + vshr.u32 q1, q15, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q10, q10, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q10, q10, q1 + vadd.u32 q4, q4, q5 + veor.u32 q10, q10, q0 + vld1.u32 {q5}, [r1]! + vadd.u32 q10, q10, q4 + + vshr.u32 q4, q9, #17 + vshl.u32 q0, q9, #32-17 + vadd.u32 q6, q6, q11 + vst1.u32 {q10}, [r2]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q9, #19 + vshl.u32 q1, q9, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q11, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q9, #10 + veor.u32 q11, q11, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q11, q11, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q11, q11, q1 + vadd.u32 q1, q6, q4 + veor.u32 q11, q11, q0 + vld1.u32 {q6}, [r1]! + vadd.u32 q11, q11, q1 + vshr.u32 q4, q10, #17 + vshl.u32 q0, q10, #32-17 + vst1.u32 {q11}, [r2]! + vadd.u32 q5, q5, q12 + veor.u32 q4, q4, q0 + vshr.u32 q0, q10, #19 + vshl.u32 q1, q10, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q12, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q12, q12, q0 + vshr.u32 q1, q10, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q12, q12, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q12, q12, q1 + vadd.u32 q4, q4, q5 + veor.u32 q12, q12, q0 + vld1.u32 {q5}, [r1]! + vadd.u32 q12, q12, q4 + + vshr.u32 q4, q11, #17 + vshl.u32 q0, q11, #32-17 + vadd.u32 q6, q6, q13 + vst1.u32 {q12}, [r2]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q11, #19 + vshl.u32 q1, q11, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q13, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q11, #10 + veor.u32 q13, q13, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q13, q13, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q13, q13, q1 + vadd.u32 q1, q6, q4 + veor.u32 q13, q13, q0 + vld1.u32 {q6}, [r1]! + vadd.u32 q13, q13, q1 + vshr.u32 q4, q12, #17 + vshl.u32 q0, q12, #32-17 + vst1.u32 {q13}, [r2]! + vadd.u32 q5, q5, q14 + veor.u32 q4, q4, q0 + vshr.u32 q0, q12, #19 + vshl.u32 q1, q12, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q14, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q14, q14, q0 + vshr.u32 q1, q12, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q14, q14, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q14, q14, q1 + vadd.u32 q4, q4, q5 + veor.u32 q14, q14, q0 + vld1.u32 {q5}, [r1]! + vadd.u32 q14, q14, q4 + + vshr.u32 q4, q13, #17 + vshl.u32 q0, q13, #32-17 + vadd.u32 q6, q6, q15 + vst1.u32 {q14}, [r2]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q13, #19 + vshl.u32 q1, q13, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q15, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q13, #10 + veor.u32 q15, q15, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q15, q15, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q15, q15, q1 + vadd.u32 q1, q6, q4 + veor.u32 q15, q15, q0 + vld1.u32 {q6}, [r1]! + vadd.u32 q15, q15, q1 + vshr.u32 q4, q14, #17 + vshl.u32 q0, q14, #32-17 + vst1.u32 {q15}, [r2]! 
+ vadd.u32 q5, q5, q9 + veor.u32 q4, q4, q0 + vshr.u32 q0, q14, #19 + vshl.u32 q1, q14, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q9, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q9, q9, q0 + vshr.u32 q1, q14, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q9, q9, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q9, q9, q1 + vadd.u32 q4, q4, q5 + veor.u32 q9, q9, q0 + vld1.u32 {q5}, [r1]! + vadd.u32 q9, q9, q4 + + vshr.u32 q4, q15, #17 + vshl.u32 q0, q15, #32-17 + vadd.u32 q6, q6, q10 + vst1.u32 {q9}, [r2]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q15, #19 + vshl.u32 q1, q15, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q10, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q15, #10 + veor.u32 q10, q10, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q10, q10, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q10, q10, q1 + vadd.u32 q1, q6, q4 + veor.u32 q10, q10, q0 + vld1.u32 {q6}, [r1]! + vadd.u32 q10, q10, q1 + vshr.u32 q4, q9, #17 + vshl.u32 q0, q9, #32-17 + vst1.u32 {q10}, [r2]! + vadd.u32 q5, q5, q11 + veor.u32 q4, q4, q0 + vshr.u32 q0, q9, #19 + vshl.u32 q1, q9, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q11, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q11, q11, q0 + vshr.u32 q1, q9, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q11, q11, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q11, q11, q1 + vadd.u32 q4, q4, q5 + veor.u32 q11, q11, q0 + vld1.u32 {q5}, [r1]! + vadd.u32 q11, q11, q4 + + vshr.u32 q4, q10, #17 + vshl.u32 q0, q10, #32-17 + vadd.u32 q6, q6, q12 + vst1.u32 {q11}, [r2]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q10, #19 + vshl.u32 q1, q10, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q12, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q10, #10 + veor.u32 q12, q12, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q12, q12, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q12, q12, q1 + vadd.u32 q1, q6, q4 + veor.u32 q12, q12, q0 + vld1.u32 {q6}, [r1]! + vadd.u32 q12, q12, q1 + vshr.u32 q4, q11, #17 + vshl.u32 q0, q11, #32-17 + vst1.u32 {q12}, [r2]! + vadd.u32 q5, q5, q13 + veor.u32 q4, q4, q0 + vshr.u32 q0, q11, #19 + vshl.u32 q1, q11, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q13, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q13, q13, q0 + vshr.u32 q1, q11, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q13, q13, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q13, q13, q1 + vadd.u32 q4, q4, q5 + veor.u32 q13, q13, q0 + vld1.u32 {q5}, [r1]! + vadd.u32 q13, q13, q4 + + vshr.u32 q4, q12, #17 + vshl.u32 q0, q12, #32-17 + vadd.u32 q6, q6, q14 + vst1.u32 {q13}, [r2]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q12, #19 + vshl.u32 q1, q12, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q14, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q12, #10 + veor.u32 q14, q14, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q14, q14, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q14, q14, q1 + vadd.u32 q1, q6, q4 + veor.u32 q14, q14, q0 + vld1.u32 {q6}, [r1]! + vadd.u32 q14, q14, q1 + vshr.u32 q4, q13, #17 + vshl.u32 q0, q13, #32-17 + vst1.u32 {q14}, [r2]! 
+ vadd.u32 q5, q5, q15 + veor.u32 q4, q4, q0 + vshr.u32 q0, q13, #19 + vshl.u32 q1, q13, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q15, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q15, q15, q0 + vshr.u32 q1, q13, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q15, q15, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q15, q15, q1 + vadd.u32 q4, q4, q5 + veor.u32 q15, q15, q0 + vld1.u32 {q5}, [r1]! + vadd.u32 q15, q15, q4 + + vshr.u32 q4, q14, #17 + vshl.u32 q0, q14, #32-17 + vadd.u32 q6, q6, q9 + vst1.u32 {q15}, [r2]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q14, #19 + vshl.u32 q1, q14, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q9, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q14, #10 + veor.u32 q9, q9, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q9, q9, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q9, q9, q1 + vadd.u32 q1, q6, q4 + veor.u32 q9, q9, q0 + vld1.u32 {q6}, [r1]! + vadd.u32 q9, q9, q1 + vshr.u32 q4, q15, #17 + vshl.u32 q0, q15, #32-17 + vst1.u32 {q9}, [r2]! + vadd.u32 q5, q5, q10 + veor.u32 q4, q4, q0 + vshr.u32 q0, q15, #19 + vshl.u32 q1, q15, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q10, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q10, q10, q0 + vshr.u32 q1, q15, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q10, q10, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q10, q10, q1 + vadd.u32 q4, q4, q5 + veor.u32 q10, q10, q0 + vld1.u32 {q5}, [r1]! + vadd.u32 q10, q10, q4 + + vshr.u32 q4, q9, #17 + vshl.u32 q0, q9, #32-17 + vadd.u32 q6, q6, q11 + vst1.u32 {q10}, [r2]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q9, #19 + vshl.u32 q1, q9, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q11, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q9, #10 + veor.u32 q11, q11, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q11, q11, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q11, q11, q1 + vadd.u32 q1, q6, q4 + veor.u32 q11, q11, q0 + vld1.u32 {q6}, [r1]! + vadd.u32 q11, q11, q1 + vshr.u32 q4, q10, #17 + vshl.u32 q0, q10, #32-17 + vst1.u32 {q11}, [r2]! + vadd.u32 q5, q5, q12 + veor.u32 q4, q4, q0 + vshr.u32 q0, q10, #19 + vshl.u32 q1, q10, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q12, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q12, q12, q0 + vshr.u32 q1, q10, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q12, q12, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q12, q12, q1 + vadd.u32 q4, q4, q5 + veor.u32 q12, q12, q0 + vld1.u32 {q5}, [r1]! + vadd.u32 q12, q12, q4 + + vshr.u32 q4, q11, #17 + vshl.u32 q0, q11, #32-17 + vadd.u32 q6, q6, q13 + vst1.u32 {q12}, [r2]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q11, #19 + vshl.u32 q1, q11, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q13, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q11, #10 + veor.u32 q13, q13, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q13, q13, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q13, q13, q1 + vadd.u32 q1, q6, q4 + veor.u32 q13, q13, q0 + vld1.u32 {q6}, [r1]! + vadd.u32 q13, q13, q1 + vshr.u32 q4, q12, #17 + vshl.u32 q0, q12, #32-17 + vst1.u32 {q13}, [r2]! 
+ vadd.u32 q5, q5, q14 + veor.u32 q4, q4, q0 + vshr.u32 q0, q12, #19 + vshl.u32 q1, q12, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q14, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q14, q14, q0 + vshr.u32 q1, q12, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q14, q14, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q14, q14, q1 + vadd.u32 q4, q4, q5 + veor.u32 q14, q14, q0 + vld1.u32 {q5}, [r1]! + vadd.u32 q14, q14, q4 + + vshr.u32 q4, q13, #17 + vshl.u32 q0, q13, #32-17 + vadd.u32 q6, q6, q15 + vst1.u32 {q14}, [r2]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q13, #19 + vshl.u32 q1, q13, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q15, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q13, #10 + veor.u32 q15, q15, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q15, q15, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q15, q15, q1 + vadd.u32 q1, q6, q4 + veor.u32 q15, q15, q0 + vld1.u32 {q6}, [r1]! + vadd.u32 q15, q15, q1 + vshr.u32 q4, q14, #17 + vshl.u32 q0, q14, #32-17 + vst1.u32 {q15}, [r2]! + vadd.u32 q5, q5, q9 + veor.u32 q4, q4, q0 + vshr.u32 q0, q14, #19 + vshl.u32 q1, q14, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q9, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q9, q9, q0 + vshr.u32 q1, q14, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q9, q9, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q9, q9, q1 + vadd.u32 q4, q4, q5 + veor.u32 q9, q9, q0 + vld1.u32 {q5}, [r1]! + vadd.u32 q9, q9, q4 + + vshr.u32 q4, q15, #17 + vshl.u32 q0, q15, #32-17 + vadd.u32 q6, q6, q10 + vst1.u32 {q9}, [r2]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q15, #19 + vshl.u32 q1, q15, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q10, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q15, #10 + veor.u32 q10, q10, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q10, q10, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q10, q10, q1 + vadd.u32 q1, q6, q4 + veor.u32 q10, q10, q0 + vld1.u32 {q6}, [r1]! + vadd.u32 q10, q10, q1 + vshr.u32 q4, q9, #17 + vshl.u32 q0, q9, #32-17 + vst1.u32 {q10}, [r2]! + vadd.u32 q5, q5, q11 + veor.u32 q4, q4, q0 + vshr.u32 q0, q9, #19 + vshl.u32 q1, q9, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q11, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q11, q11, q0 + vshr.u32 q1, q9, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q11, q11, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q11, q11, q1 + vadd.u32 q4, q4, q5 + veor.u32 q11, q11, q0 + vld1.u32 {q5}, [r1]! + vadd.u32 q11, q11, q4 + + vshr.u32 q4, q10, #17 + vshl.u32 q0, q10, #32-17 + vadd.u32 q6, q6, q12 + vst1.u32 {q11}, [r2]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q10, #19 + vshl.u32 q1, q10, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q12, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q10, #10 + veor.u32 q12, q12, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q12, q12, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q12, q12, q1 + vadd.u32 q1, q6, q4 + veor.u32 q12, q12, q0 + vld1.u32 {q6}, [r1]! + vadd.u32 q12, q12, q1 + vshr.u32 q4, q11, #17 + vshl.u32 q0, q11, #32-17 + vst1.u32 {q12}, [r2]! 
+ vadd.u32 q5, q5, q13 + veor.u32 q4, q4, q0 + vshr.u32 q0, q11, #19 + vshl.u32 q1, q11, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q13, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q13, q13, q0 + vshr.u32 q1, q11, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q13, q13, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q13, q13, q1 + vadd.u32 q4, q4, q5 + veor.u32 q13, q13, q0 + vld1.u32 {q5}, [r1]! + vadd.u32 q13, q13, q4 + + vshr.u32 q4, q12, #17 + vshl.u32 q0, q12, #32-17 + vadd.u32 q6, q6, q14 + vst1.u32 {q13}, [r2]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q12, #19 + vshl.u32 q1, q12, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q14, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q12, #10 + veor.u32 q14, q14, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q14, q14, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q14, q14, q1 + vadd.u32 q1, q6, q4 + veor.u32 q14, q14, q0 + vadd.u32 q14, q14, q1 + vst1.u32 {q14}, [r2]! + + vldmia r0, {q0-q7} + adr r4, sha256_transform_4way_4k + b sha256_transform_4way_4k_over + .align 4 +sha256_transform_4way_4k: + .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98 + .long 0x71374491, 0x71374491, 0x71374491, 0x71374491 + .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf + .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5 + .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b + .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1 + .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4 + .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5 + .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98 + .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01 + .long 0x243185be, 0x243185be, 0x243185be, 0x243185be + .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3 + .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74 + .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe + .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7 + .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174 + .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1 + .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786 + .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6 + .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc + .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f + .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa + .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc + .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da + .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152 + .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d + .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8 + .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7 + .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3 + .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147 + .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351 + .long 0x14292967, 0x14292967, 0x14292967, 0x14292967 + .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85 + .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138 + .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc + .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13 + .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354 + .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb + .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e + .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85 + .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1 + .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b + .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70 + .long 0xc76c51a3, 0xc76c51a3, 
0xc76c51a3, 0xc76c51a3 + .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819 + .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624 + .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585 + .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070 + .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116 + .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08 + .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c + .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5 + .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3 + .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a + .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f + .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3 + .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee + .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f + .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814 + .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208 + .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa + .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb + .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7 + .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2 +sha256_transform_4way_4k_over: + vld1.u32 {q8}, [sp]! + vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! 
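+	@ Main rounds (fully unrolled): W[] is streamed back off the stack
+	@ via post-incremented loads from sp, and the lane-broadcast round
+	@ constants K[0..63] are streamed via r4.  Per lane, each block
+	@ computes
+	@   t1    = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i]
+	@   new_e = d + t1
+	@   new_a = t1 + Sigma0(a) + Maj(a,b,c)
+	@ with Ch(e,f,g)  = (e & f) | (~e & g)
+	@ and  Maj(a,b,c) = (a & b) ^ ((a ^ b) & c).
+	@ Sigma1 is computed as ror(e ^ ror(e,5) ^ ror(e,19), 6) and Sigma0
+	@ as ror(a ^ ror(a,11) ^ ror(a,20), 2) (equivalent, since rotation
+	@ distributes over xor).  Instead of shifting a..h between
+	@ registers, the roles of q0-q3 (a..d) and q4-q7 (e..h) rotate,
+	@ so the same register pattern repeats every four rounds.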
+ vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [sp]! 
+ vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! 
+ vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [sp]! 
+ vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! 
+ vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [sp]! 
+ vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! 
+ vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [sp]! 
+ vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! 
+ vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [sp]! 
+ vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! 
+ vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [sp]! 
+ vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! 
+ vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [sp]! 
+ vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! 
+ vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [sp]! 
+ vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! 
+ vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [sp]! 
+ vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [sp]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r4]! + vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + + vldmia r0, {q8-q15} + vadd.u32 q0, q0, q8 + vadd.u32 q1, q1, q9 + vadd.u32 q2, q2, q10 vadd.u32 q3, q3, q11 vadd.u32 q4, q4, q12 vadd.u32 q5, q5, q13 - vadd.u32 q6, q6, q14 - vadd.u32 q7, q7, q15 - vstmia r0, {q0-q7} - - mov sp, r12 - vpop {q4-q7} - ldmfd sp!, {r4, pc} - + vadd.u32 q6, q6, q14 + vadd.u32 q7, q7, q15 + vstmia r0, {q0-q7} + + mov sp, r12 + vpop {q4-q7} + ldmfd sp!, {r4, pc} + + + .text + .code 32 + .align 2 + .globl sha256d_ms_4way + .globl _sha256d_ms_4way +#ifdef __ELF__ + .type sha256d_ms_4way, %function +#endif +sha256d_ms_4way: +_sha256d_ms_4way: + stmfd sp!, {r4, lr} + vpush {q4-q7} + mov r12, sp + sub sp, sp, #64*16 + bic sp, sp, #63 + + add r4, r1, #3*16 + vld1.u32 {q6}, [r4]! + add r1, r1, #18*16 + vldmia r1, {q11-q13} + cmp r0, r0 + + vshr.u32 q10, q6, #7 + vshl.u32 q0, q6, #32-7 + vshr.u32 q1, q6, #18 + veor.u32 q10, q10, q0 + vshl.u32 q0, q6, #32-18 + veor.u32 q10, q10, q1 + vshr.u32 q1, q6, #3 + veor.u32 q10, q10, q0 + vstmia sp!, {q11-q13} + veor.u32 q4, q10, q1 + vadd.u32 q12, q12, q6 + vadd.u32 q11, q11, q4 + + vshr.u32 q14, q12, #17 + vshr.u32 q4, q11, #17 + vshl.u32 q0, q11, #32-17 + vst1.u32 {q11}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q11, #19 + vshl.u32 q1, q11, #32-19 + veor.u32 q4, q4, q0 + vst1.u32 {q12}, [r1]! + veor.u32 q4, q4, q1 + vshr.u32 q1, q11, #10 + vshl.u32 q0, q12, #32-17 + veor.u32 q4, q4, q1 + veor.u32 q14, q14, q0 + vadd.u32 q13, q13, q4 + vshr.u32 q0, q12, #19 + vshl.u32 q1, q12, #32-19 + veor.u32 q14, q14, q0 + vst1.u32 {q13}, [r1]! 
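The unrolled blocks above finish the 4-way SHA-256 block transform: each block advances one round for four independent message lanes (one 32-bit element per q-register lane), the eight working variables stay in q0-q7 while their roles rotate from block to block instead of the data being moved, and the closing vldmia r0 / vadd / vstmia r0 sequence adds the result back into the caller's state before the epilogue returns. The code that follows begins sha256d_ms_4way, the 4-way double SHA-256 used on the mining path. For reference, a minimal scalar sketch of one round, assuming only the standard FIPS 180-4 definitions (rotr and sha256_round are illustrative names, not part of this patch):

#include <stdint.h>

static inline uint32_t rotr(uint32_t x, int n)
{
	return (x >> n) | (x << (32 - n));
}

/* One scalar SHA-256 round; each NEON block above does the same work on four
 * lanes at once.  The vshr/vshl pairs implement the factored rotations
 *   Sigma1(e) = rotr(e ^ rotr(e, 5) ^ rotr(e, 19), 6)
 *   Sigma0(a) = rotr(a ^ rotr(a, 11) ^ rotr(a, 20), 2)
 * which equal the usual rotr6^rotr11^rotr25 and rotr2^rotr13^rotr22 forms. */
static void sha256_round(uint32_t s[8], uint32_t k, uint32_t w)
{
	uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
	uint32_t e = s[4], f = s[5], g = s[6], h = s[7];
	uint32_t t1 = h + rotr(e ^ rotr(e, 5) ^ rotr(e, 19), 6)
	            + ((e & f) ^ (~e & g))             /* Ch:  vand + vbic + vorr */
	            + k + w;
	uint32_t t2 = rotr(a ^ rotr(a, 11) ^ rotr(a, 20), 2)
	            + ((a & b) ^ ((a ^ b) & c));       /* Maj: vand + veor + vand */
	s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
	s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
}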
+ veor.u32 q14, q14, q1 + vshr.u32 q1, q12, #10 + + vshr.u32 q4, q13, #17 + vshl.u32 q0, q13, #32-17 + veor.u32 q14, q14, q1 + veor.u32 q4, q4, q0 + vshr.u32 q0, q13, #19 + vshl.u32 q1, q13, #32-19 + veor.u32 q4, q4, q0 + vst1.u32 {q14}, [r1]! + veor.u32 q4, q4, q1 + vshr.u32 q1, q13, #10 + vld1.u32 {q15}, [r1] + veor.u32 q4, q4, q1 + vst1.u32 {q15}, [sp]! + vadd.u32 q15, q15, q4 + vshr.u32 q4, q14, #17 + vshl.u32 q0, q14, #32-17 + vshl.u32 q1, q14, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q0, q14, #19 + vst1.u32 {q15}, [r1]! + veor.u32 q4, q4, q0 + vld1.u32 {q9}, [r1] + veor.u32 q4, q4, q1 + vshr.u32 q1, q14, #10 + vst1.u32 {q9}, [sp]! + veor.u32 q5, q4, q1 + + vshr.u32 q4, q15, #17 + vadd.u32 q9, q9, q5 + vshl.u32 q0, q15, #32-17 + vshl.u32 q1, q15, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q0, q15, #19 + vst1.u32 {q9}, [r1]! + veor.u32 q4, q4, q0 + vld1.u32 {q10}, [r1] + veor.u32 q4, q4, q1 + vshr.u32 q1, q15, #10 + vst1.u32 {q10}, [sp]! + veor.u32 q4, q4, q1 + vshl.u32 q0, q9, #32-17 + vadd.u32 q10, q10, q4 + vshr.u32 q4, q9, #17 + vshl.u32 q1, q9, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q0, q9, #19 + veor.u32 q4, q4, q1 + vshr.u32 q1, q9, #10 + veor.u32 q4, q4, q0 + vst1.u32 {q10}, [r1]! + veor.u32 q5, q4, q1 + + vshr.u32 q4, q10, #17 + vshl.u32 q0, q10, #32-17 + vadd.u32 q11, q11, q5 + veor.u32 q4, q4, q0 + vshr.u32 q0, q10, #19 + vshl.u32 q1, q10, #32-19 + veor.u32 q4, q4, q0 + vst1.u32 {q11}, [r1]! + veor.u32 q4, q4, q1 + vshr.u32 q1, q10, #10 + vshl.u32 q0, q11, #32-17 + veor.u32 q2, q4, q1 + vshr.u32 q4, q11, #17 + vadd.u32 q12, q12, q2 + vshl.u32 q1, q11, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q0, q11, #19 + veor.u32 q4, q4, q1 + vshr.u32 q1, q11, #10 + veor.u32 q4, q4, q0 + vst1.u32 {q12}, [r1]! + veor.u32 q5, q4, q1 + + vshr.u32 q4, q12, #17 + vshl.u32 q0, q12, #32-17 + vadd.u32 q13, q13, q5 + veor.u32 q4, q4, q0 + vshr.u32 q0, q12, #19 + vshl.u32 q1, q12, #32-19 + veor.u32 q4, q4, q0 + vst1.u32 {q13}, [r1]! + veor.u32 q4, q4, q1 + vshr.u32 q1, q12, #10 + vshl.u32 q0, q13, #32-17 + veor.u32 q2, q4, q1 + vshr.u32 q4, q13, #17 + vadd.u32 q14, q14, q2 + vshl.u32 q1, q13, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q0, q13, #19 + veor.u32 q4, q4, q1 + vshr.u32 q1, q13, #10 + veor.u32 q4, q4, q0 + vst1.u32 {q14}, [r1]! + veor.u32 q5, q4, q1 + add r4, r4, #12*16 + + vshr.u32 q4, q14, #17 + vshl.u32 q0, q14, #32-17 + vadd.u32 q15, q15, q5 + veor.u32 q4, q4, q0 + vshr.u32 q0, q14, #19 + vshl.u32 q1, q14, #32-19 + veor.u32 q4, q4, q0 + vst1.u32 {q15}, [r1]! + veor.u32 q4, q4, q1 + vshr.u32 q1, q14, #10 + vld1.u32 {q2}, [r1] + veor.u32 q4, q4, q1 + vshl.u32 q0, q15, #32-17 + vadd.u32 q9, q9, q4 + vst1.u32 {q2}, [sp]! + vadd.u32 q9, q9, q2 + vshr.u32 q4, q15, #17 + vshr.u32 q2, q15, #19 + veor.u32 q4, q4, q0 + vst1.u32 {q9}, [r1]! + vshl.u32 q1, q15, #32-19 + veor.u32 q4, q4, q2 + vshr.u32 q0, q15, #10 + veor.u32 q4, q4, q1 + vld1.u32 {q5-q6}, [r4]! + veor.u32 q4, q4, q0 + vld1.u32 {q2}, [r1] + vadd.u32 q10, q10, q4 + vst1.u32 {q2}, [sp]! + vadd.u32 q10, q10, q2 + + sub sp, sp, #8*16 + +sha256d_ms_4way_extend_loop2: + vshr.u32 q4, q9, #17 + vshl.u32 q0, q9, #32-17 + vst1.u32 {q10}, [r1]! 
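The stores above seed the first few extended message words, and the loop entered at sha256d_ms_4way_extend_loop2 then generates the remaining schedule words for all four lanes, interleaving the sigma0 and sigma1 computations of neighbouring words to hide shift latency. A scalar sketch of the recurrence it implements, again assuming the standard FIPS 180-4 definitions (rotr32 and sha256_extend are illustrative names only):

#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, int n)
{
	return (x >> n) | (x << (32 - n));
}

/* Message-schedule extension for one lane.  In the loop above, the
 * vshr/vshl/veor groups with shifts 7/18/3 are sigma0 and the groups with
 * shifts 17/19/10 are sigma1, applied to four lanes per q register. */
static void sha256_extend(uint32_t w[64])
{
	for (int t = 16; t < 64; t++) {
		uint32_t s0 = rotr32(w[t - 15], 7) ^ rotr32(w[t - 15], 18) ^ (w[t - 15] >> 3);
		uint32_t s1 = rotr32(w[t - 2], 17) ^ rotr32(w[t - 2], 19) ^ (w[t - 2] >> 10);
		w[t] = w[t - 16] + s0 + w[t - 7] + s1;
	}
}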
+ vadd.u32 q5, q5, q11 + veor.u32 q4, q4, q0 + vshr.u32 q0, q9, #19 + vshl.u32 q1, q9, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q11, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q11, q11, q0 + vshr.u32 q1, q9, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q11, q11, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q11, q11, q1 + vadd.u32 q4, q4, q5 + veor.u32 q11, q11, q0 + vld1.u32 {q5}, [r4]! + vadd.u32 q11, q11, q4 + + vshr.u32 q4, q10, #17 + vshl.u32 q0, q10, #32-17 + vadd.u32 q6, q6, q12 + vst1.u32 {q11}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q10, #19 + vshl.u32 q1, q10, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q12, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q10, #10 + veor.u32 q12, q12, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q12, q12, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q12, q12, q1 + vadd.u32 q1, q6, q4 + veor.u32 q12, q12, q0 + vld1.u32 {q6}, [r4]! + vadd.u32 q12, q12, q1 + vshr.u32 q4, q11, #17 + vshl.u32 q0, q11, #32-17 + vst1.u32 {q12}, [r1]! + vadd.u32 q5, q5, q13 + veor.u32 q4, q4, q0 + vshr.u32 q0, q11, #19 + vshl.u32 q1, q11, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q13, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q13, q13, q0 + vshr.u32 q1, q11, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q13, q13, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q13, q13, q1 + vadd.u32 q4, q4, q5 + veor.u32 q13, q13, q0 + vld1.u32 {q5}, [r4]! + vadd.u32 q13, q13, q4 + + vshr.u32 q4, q12, #17 + vshl.u32 q0, q12, #32-17 + vadd.u32 q6, q6, q14 + vst1.u32 {q13}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q12, #19 + vshl.u32 q1, q12, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q14, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q12, #10 + veor.u32 q14, q14, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q14, q14, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q14, q14, q1 + vadd.u32 q1, q6, q4 + veor.u32 q14, q14, q0 + vld1.u32 {q6}, [r4]! + vadd.u32 q14, q14, q1 + vshr.u32 q4, q13, #17 + vshl.u32 q0, q13, #32-17 + vst1.u32 {q14}, [r1]! + vadd.u32 q5, q5, q15 + veor.u32 q4, q4, q0 + vshr.u32 q0, q13, #19 + vshl.u32 q1, q13, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q15, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q15, q15, q0 + vshr.u32 q1, q13, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q15, q15, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q15, q15, q1 + vadd.u32 q4, q4, q5 + veor.u32 q15, q15, q0 + vld1.u32 {q5}, [r4]! + vadd.u32 q15, q15, q4 + + vshr.u32 q4, q14, #17 + vshl.u32 q0, q14, #32-17 + vadd.u32 q6, q6, q9 + vst1.u32 {q15}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q14, #19 + vshl.u32 q1, q14, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q9, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q14, #10 + veor.u32 q9, q9, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q9, q9, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q9, q9, q1 + vadd.u32 q1, q6, q4 + veor.u32 q9, q9, q0 + vld1.u32 {q6}, [r4]! + vadd.u32 q9, q9, q1 + vshr.u32 q4, q15, #17 + vshl.u32 q0, q15, #32-17 + vst1.u32 {q9}, [r1]! 
+ vadd.u32 q5, q5, q10 + veor.u32 q4, q4, q0 + vshr.u32 q0, q15, #19 + vshl.u32 q1, q15, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q10, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q10, q10, q0 + vshr.u32 q1, q15, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q10, q10, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q10, q10, q1 + vadd.u32 q4, q4, q5 + veor.u32 q10, q10, q0 + vld1.u32 {q5}, [r4]! + vadd.u32 q10, q10, q4 + + vshr.u32 q4, q9, #17 + vshl.u32 q0, q9, #32-17 + vadd.u32 q6, q6, q11 + vst1.u32 {q10}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q9, #19 + vshl.u32 q1, q9, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q11, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q9, #10 + veor.u32 q11, q11, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q11, q11, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q11, q11, q1 + vadd.u32 q1, q6, q4 + veor.u32 q11, q11, q0 + vld1.u32 {q6}, [r4]! + vadd.u32 q11, q11, q1 + vshr.u32 q4, q10, #17 + vshl.u32 q0, q10, #32-17 + vst1.u32 {q11}, [r1]! + vadd.u32 q5, q5, q12 + veor.u32 q4, q4, q0 + vshr.u32 q0, q10, #19 + vshl.u32 q1, q10, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q12, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q12, q12, q0 + vshr.u32 q1, q10, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q12, q12, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q12, q12, q1 + vadd.u32 q4, q4, q5 + veor.u32 q12, q12, q0 + vld1.u32 {q5}, [r4]! + vadd.u32 q12, q12, q4 + + vshr.u32 q4, q11, #17 + vshl.u32 q0, q11, #32-17 + vadd.u32 q6, q6, q13 + vst1.u32 {q12}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q11, #19 + vshl.u32 q1, q11, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q13, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q11, #10 + veor.u32 q13, q13, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q13, q13, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q13, q13, q1 + vadd.u32 q1, q6, q4 + veor.u32 q13, q13, q0 + vld1.u32 {q6}, [r4]! + vadd.u32 q13, q13, q1 + vshr.u32 q4, q12, #17 + vshl.u32 q0, q12, #32-17 + vst1.u32 {q13}, [r1]! + vadd.u32 q5, q5, q14 + veor.u32 q4, q4, q0 + vshr.u32 q0, q12, #19 + vshl.u32 q1, q12, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q14, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q14, q14, q0 + vshr.u32 q1, q12, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q14, q14, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q14, q14, q1 + vadd.u32 q4, q4, q5 + veor.u32 q14, q14, q0 + vld1.u32 {q5}, [r4]! + vadd.u32 q14, q14, q4 + + vshr.u32 q4, q13, #17 + vshl.u32 q0, q13, #32-17 + vadd.u32 q6, q6, q15 + vst1.u32 {q14}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q13, #19 + vshl.u32 q1, q13, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q15, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q13, #10 + veor.u32 q15, q15, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q15, q15, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q15, q15, q1 + vadd.u32 q1, q6, q4 + veor.u32 q15, q15, q0 + vld1.u32 {q6}, [r4]! + vadd.u32 q15, q15, q1 + vshr.u32 q4, q14, #17 + vshl.u32 q0, q14, #32-17 + vst1.u32 {q15}, [r1]! 
+ vadd.u32 q5, q5, q9 + veor.u32 q4, q4, q0 + vshr.u32 q0, q14, #19 + vshl.u32 q1, q14, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q9, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q9, q9, q0 + vshr.u32 q1, q14, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q9, q9, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q9, q9, q1 + vadd.u32 q4, q4, q5 + veor.u32 q9, q9, q0 + vld1.u32 {q5}, [r4]! + vadd.u32 q9, q9, q4 + + vshr.u32 q4, q15, #17 + vshl.u32 q0, q15, #32-17 + vadd.u32 q6, q6, q10 + vst1.u32 {q9}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q15, #19 + vshl.u32 q1, q15, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q10, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q15, #10 + veor.u32 q10, q10, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q10, q10, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q10, q10, q1 + vadd.u32 q1, q6, q4 + veor.u32 q10, q10, q0 + vld1.u32 {q6}, [r4]! + vadd.u32 q10, q10, q1 + vshr.u32 q4, q9, #17 + vshl.u32 q0, q9, #32-17 + vst1.u32 {q10}, [r1]! + vadd.u32 q5, q5, q11 + veor.u32 q4, q4, q0 + vshr.u32 q0, q9, #19 + vshl.u32 q1, q9, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q11, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q11, q11, q0 + vshr.u32 q1, q9, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q11, q11, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q11, q11, q1 + vadd.u32 q4, q4, q5 + veor.u32 q11, q11, q0 + vld1.u32 {q5}, [r4]! + vadd.u32 q11, q11, q4 + + vshr.u32 q4, q10, #17 + vshl.u32 q0, q10, #32-17 + vadd.u32 q6, q6, q12 + vst1.u32 {q11}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q10, #19 + vshl.u32 q1, q10, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q12, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q10, #10 + veor.u32 q12, q12, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q12, q12, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q12, q12, q1 + vadd.u32 q1, q6, q4 + veor.u32 q12, q12, q0 + vld1.u32 {q6}, [r4]! + vadd.u32 q12, q12, q1 + vshr.u32 q4, q11, #17 + vshl.u32 q0, q11, #32-17 + vst1.u32 {q12}, [r1]! + vadd.u32 q5, q5, q13 + veor.u32 q4, q4, q0 + vshr.u32 q0, q11, #19 + vshl.u32 q1, q11, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q13, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q13, q13, q0 + vshr.u32 q1, q11, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q13, q13, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q13, q13, q1 + vadd.u32 q4, q4, q5 + veor.u32 q13, q13, q0 + vld1.u32 {q5}, [r4]! + vadd.u32 q13, q13, q4 + + vshr.u32 q4, q12, #17 + vshl.u32 q0, q12, #32-17 + vadd.u32 q6, q6, q14 + vst1.u32 {q13}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q12, #19 + vshl.u32 q1, q12, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q14, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q12, #10 + veor.u32 q14, q14, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q14, q14, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q14, q14, q1 + vadd.u32 q1, q6, q4 + veor.u32 q14, q14, q0 + vld1.u32 {q6}, [r4]! + vadd.u32 q14, q14, q1 + vshr.u32 q4, q13, #17 + vshl.u32 q0, q13, #32-17 + vst1.u32 {q14}, [r1]! 
+ vadd.u32 q5, q5, q15 + veor.u32 q4, q4, q0 + vshr.u32 q0, q13, #19 + vshl.u32 q1, q13, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q15, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q15, q15, q0 + vshr.u32 q1, q13, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q15, q15, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q15, q15, q1 + vadd.u32 q4, q4, q5 + veor.u32 q15, q15, q0 + vld1.u32 {q5}, [r4]! + vadd.u32 q15, q15, q4 + + vshr.u32 q4, q14, #17 + vshl.u32 q0, q14, #32-17 + vadd.u32 q6, q6, q9 + vst1.u32 {q15}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q14, #19 + vshl.u32 q1, q14, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q9, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q14, #10 + veor.u32 q9, q9, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q9, q9, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q9, q9, q1 + vadd.u32 q1, q6, q4 + veor.u32 q9, q9, q0 + vld1.u32 {q6}, [r4]! + vadd.u32 q9, q9, q1 + vshr.u32 q4, q15, #17 + vshl.u32 q0, q15, #32-17 + vst1.u32 {q9}, [r1]! + vadd.u32 q5, q5, q10 + veor.u32 q4, q4, q0 + vshr.u32 q0, q15, #19 + vshl.u32 q1, q15, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q10, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q10, q10, q0 + vshr.u32 q1, q15, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q10, q10, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q10, q10, q1 + vadd.u32 q4, q4, q5 + veor.u32 q10, q10, q0 + vld1.u32 {q5}, [r4]! + vadd.u32 q10, q10, q4 + + vshr.u32 q4, q9, #17 + vshl.u32 q0, q9, #32-17 + vadd.u32 q6, q6, q11 + vst1.u32 {q10}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q9, #19 + vshl.u32 q1, q9, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q11, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q9, #10 + veor.u32 q11, q11, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q11, q11, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q11, q11, q1 + vadd.u32 q1, q6, q4 + veor.u32 q11, q11, q0 + vld1.u32 {q6}, [r4]! + vadd.u32 q11, q11, q1 + vshr.u32 q4, q10, #17 + vshl.u32 q0, q10, #32-17 + vst1.u32 {q11}, [r1]! + vadd.u32 q5, q5, q12 + veor.u32 q4, q4, q0 + vshr.u32 q0, q10, #19 + vshl.u32 q1, q10, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q12, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q12, q12, q0 + vshr.u32 q1, q10, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q12, q12, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q12, q12, q1 + vadd.u32 q4, q4, q5 + veor.u32 q12, q12, q0 + vld1.u32 {q5}, [r4]! + vadd.u32 q12, q12, q4 + + vshr.u32 q4, q11, #17 + vshl.u32 q0, q11, #32-17 + vadd.u32 q6, q6, q13 + vst1.u32 {q12}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q11, #19 + vshl.u32 q1, q11, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q13, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q11, #10 + veor.u32 q13, q13, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q13, q13, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q13, q13, q1 + vadd.u32 q1, q6, q4 + veor.u32 q13, q13, q0 + vld1.u32 {q6}, [r4]! + vadd.u32 q13, q13, q1 + vshr.u32 q4, q12, #17 + vshl.u32 q0, q12, #32-17 + vst1.u32 {q13}, [r1]! 
+ vadd.u32 q5, q5, q14 + veor.u32 q4, q4, q0 + vshr.u32 q0, q12, #19 + vshl.u32 q1, q12, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q14, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q14, q14, q0 + vshr.u32 q1, q12, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q14, q14, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q14, q14, q1 + vadd.u32 q4, q4, q5 + veor.u32 q14, q14, q0 + vld1.u32 {q5}, [r4]! + vadd.u32 q14, q14, q4 + + vshr.u32 q4, q13, #17 + vshl.u32 q0, q13, #32-17 + vadd.u32 q6, q6, q15 + vst1.u32 {q14}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q13, #19 + vshl.u32 q1, q13, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q15, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q13, #10 + veor.u32 q15, q15, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q15, q15, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q15, q15, q1 + vadd.u32 q1, q6, q4 + veor.u32 q15, q15, q0 + vld1.u32 {q6}, [r4]! + vadd.u32 q15, q15, q1 + vshr.u32 q4, q14, #17 + vshl.u32 q0, q14, #32-17 + vst1.u32 {q15}, [r1]! + vadd.u32 q5, q5, q9 + veor.u32 q4, q4, q0 + vshr.u32 q0, q14, #19 + vshl.u32 q1, q14, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q9, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q9, q9, q0 + vshr.u32 q1, q14, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q9, q9, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q9, q9, q1 + vadd.u32 q4, q4, q5 + veor.u32 q9, q9, q0 + vld1.u32 {q5}, [r4]! + vadd.u32 q9, q9, q4 + + vshr.u32 q4, q15, #17 + vshl.u32 q0, q15, #32-17 + vadd.u32 q6, q6, q10 + vst1.u32 {q9}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q15, #19 + vshl.u32 q1, q15, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q10, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q15, #10 + veor.u32 q10, q10, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q10, q10, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q10, q10, q1 + vadd.u32 q1, q6, q4 + veor.u32 q10, q10, q0 + vld1.u32 {q6}, [r4]! + vadd.u32 q10, q10, q1 + vshr.u32 q4, q9, #17 + vshl.u32 q0, q9, #32-17 + vst1.u32 {q10}, [r1]! + vadd.u32 q5, q5, q11 + veor.u32 q4, q4, q0 + vshr.u32 q0, q9, #19 + vshl.u32 q1, q9, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q11, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q11, q11, q0 + vshr.u32 q1, q9, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q11, q11, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q11, q11, q1 + vadd.u32 q4, q4, q5 + veor.u32 q11, q11, q0 + vld1.u32 {q5}, [r4]! + vadd.u32 q11, q11, q4 + + vshr.u32 q4, q10, #17 + vshl.u32 q0, q10, #32-17 + vadd.u32 q6, q6, q12 + vst1.u32 {q11}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q10, #19 + vshl.u32 q1, q10, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q12, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q10, #10 + veor.u32 q12, q12, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q12, q12, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q12, q12, q1 + vadd.u32 q1, q6, q4 + veor.u32 q12, q12, q0 + vld1.u32 {q6}, [r4]! + vadd.u32 q12, q12, q1 + vshr.u32 q4, q11, #17 + vshl.u32 q0, q11, #32-17 + vst1.u32 {q12}, [r1]! 
+ vadd.u32 q5, q5, q13 + veor.u32 q4, q4, q0 + vshr.u32 q0, q11, #19 + vshl.u32 q1, q11, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q13, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q13, q13, q0 + vshr.u32 q1, q11, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q13, q13, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q13, q13, q1 + vadd.u32 q4, q4, q5 + veor.u32 q13, q13, q0 + vld1.u32 {q5}, [r4]! + vadd.u32 q13, q13, q4 + + vshr.u32 q4, q12, #17 + vshl.u32 q0, q12, #32-17 + vadd.u32 q6, q6, q14 + vst1.u32 {q13}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q12, #19 + vshl.u32 q1, q12, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q14, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q12, #10 + veor.u32 q14, q14, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q14, q14, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q14, q14, q1 + vadd.u32 q1, q6, q4 + veor.u32 q14, q14, q0 + vadd.u32 q14, q14, q1 + vst1.u32 {q14}, [r1]! + bne sha256d_ms_4way_extend_coda2 + + vldmia r3!, {q4-q7} + vldmia r3, {q0-q3} + vswp q0, q4 + adr r3, sha256d_ms_4way_4k+3*16 + sub r1, r1, #(64-3)*16 + b sha256d_ms_4way_main_loop1 + + .align 4 +sha256d_ms_4way_4k: + .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98 + .long 0x71374491, 0x71374491, 0x71374491, 0x71374491 + .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf + .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5 + .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b + .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1 + .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4 + .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5 + .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98 + .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01 + .long 0x243185be, 0x243185be, 0x243185be, 0x243185be + .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3 + .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74 + .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe + .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7 + .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174 + .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1 + .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786 + .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6 + .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc + .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f + .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa + .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc + .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da + .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152 + .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d + .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8 + .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7 + .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3 + .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147 + .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351 + .long 0x14292967, 0x14292967, 0x14292967, 0x14292967 + .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85 + .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138 + .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc + .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13 + .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354 + .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb + .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e + .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85 + .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1 + .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b + .long 
0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70 + .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3 + .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819 + .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624 + .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585 + .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070 + .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116 + .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08 + .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c + .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5 + .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3 + .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a + .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f + .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3 + .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee + .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f + .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814 + .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208 + .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa + .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb + .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7 + .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2 + +sha256d_ms_4way_main_loop2: + vld1.u32 {q8}, [r1]! + vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! 
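The sha256d_ms_4way_4k table above stores the 64 SHA-256 round constants with each value repeated four times, so a single vld1.u32 {q} through r3 broadcasts K[t] to all four lanes. Note also that the earlier branch to sha256d_ms_4way_main_loop1 first points r3 at sha256d_ms_4way_4k+3*16 and rewinds r1 by (64-3)*16, so that pass starts at round 3; the first three rounds depend only on header words that do not change per nonce and are presumably covered by the precomputed prehash. The schedule and state arrays use the same 4-way interleaved layout; a hypothetical accessor (lane_word is not part of the patch) makes it explicit:

#include <stdint.h>

/* Hypothetical accessor for the interleaved 4-way layout assumed above:
 * word t of lane n (n = 0..3) lives at v[4*t + n], so one 128-bit NEON load
 * from v + 4*t fetches word t of all four parallel hashes at once. */
static inline uint32_t lane_word(const uint32_t *v, int t, int lane)
{
	return v[4 * t + lane];
}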
+ vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 +sha256d_ms_4way_main_loop1: + vld1.u32 {q8}, [r1]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [r1]! 
+ vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! 
+ vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [r1]! 
+ vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! 
+ vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [r1]! 
+ vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! 
+ vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [r1]! 
+ vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! 
+ vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [r1]! 
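+/* The second half of each round adds Maj(a,b,c) = (a & b) ^ ((a ^ b) & c)
+ * and Sigma0(a) = ROTR2(a ^ ROTR11(a) ^ ROTR20(a)).  The working variables
+ * rotate through q0-q7 from round to round, so no register moves are needed
+ * between consecutive rounds. */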
+ vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! 
+ vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [r1]! 
+ vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! 
+ vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 - .text - .code 32 - .align 2 - .globl sha256d_ms_4way - .globl _sha256d_ms_4way -#ifdef __ELF__ - .type sha256d_ms_4way, %function -#endif -sha256d_ms_4way: -_sha256d_ms_4way: - stmfd sp!, {r4, lr} - vpush {q4-q7} - mov r12, sp - sub sp, sp, #64*16 - bic sp, sp, #63 - - add r4, r1, #3*16 - vld1.u32 {q6}, [r4]! 
- add r1, r1, #18*16 - vldmia r1, {q11-q13} - cmp r0, r0 - - vshr.u32 q10, q6, #7 - vshl.u32 q0, q6, #32-7 - vshr.u32 q1, q6, #18 - veor.u32 q10, q10, q0 - vshl.u32 q0, q6, #32-18 - veor.u32 q10, q10, q1 - vshr.u32 q1, q6, #3 - veor.u32 q10, q10, q0 - vstmia sp!, {q11-q13} - veor.u32 q4, q10, q1 - vadd.u32 q12, q12, q6 - vadd.u32 q11, q11, q4 - - vshr.u32 q14, q12, #17 - vshr.u32 q4, q11, #17 - vshl.u32 q0, q11, #32-17 - vst1.u32 {q11}, [r1]! - veor.u32 q4, q4, q0 - vshr.u32 q0, q11, #19 - vshl.u32 q1, q11, #32-19 - veor.u32 q4, q4, q0 - vst1.u32 {q12}, [r1]! - veor.u32 q4, q4, q1 - vshr.u32 q1, q11, #10 - vshl.u32 q0, q12, #32-17 - veor.u32 q4, q4, q1 - veor.u32 q14, q14, q0 - vadd.u32 q13, q13, q4 - vshr.u32 q0, q12, #19 - vshl.u32 q1, q12, #32-19 - veor.u32 q14, q14, q0 - vst1.u32 {q13}, [r1]! - veor.u32 q14, q14, q1 - vshr.u32 q1, q12, #10 - - vshr.u32 q4, q13, #17 - vshl.u32 q0, q13, #32-17 - veor.u32 q14, q14, q1 - veor.u32 q4, q4, q0 - vshr.u32 q0, q13, #19 - vshl.u32 q1, q13, #32-19 - veor.u32 q4, q4, q0 - vst1.u32 {q14}, [r1]! - veor.u32 q4, q4, q1 - vshr.u32 q1, q13, #10 - vld1.u32 {q15}, [r1] - veor.u32 q4, q4, q1 - vst1.u32 {q15}, [sp]! - vadd.u32 q15, q15, q4 - vshr.u32 q4, q14, #17 - vshl.u32 q0, q14, #32-17 - vshl.u32 q1, q14, #32-19 - veor.u32 q4, q4, q0 - vshr.u32 q0, q14, #19 - vst1.u32 {q15}, [r1]! - veor.u32 q4, q4, q0 - vld1.u32 {q9}, [r1] - veor.u32 q4, q4, q1 - vshr.u32 q1, q14, #10 - vst1.u32 {q9}, [sp]! - veor.u32 q5, q4, q1 - - vshr.u32 q4, q15, #17 - vadd.u32 q9, q9, q5 - vshl.u32 q0, q15, #32-17 - vshl.u32 q1, q15, #32-19 - veor.u32 q4, q4, q0 - vshr.u32 q0, q15, #19 - vst1.u32 {q9}, [r1]! - veor.u32 q4, q4, q0 - vld1.u32 {q10}, [r1] - veor.u32 q4, q4, q1 - vshr.u32 q1, q15, #10 - vst1.u32 {q10}, [sp]! - veor.u32 q4, q4, q1 - vshl.u32 q0, q9, #32-17 - vadd.u32 q10, q10, q4 - vshr.u32 q4, q9, #17 - vshl.u32 q1, q9, #32-19 - veor.u32 q4, q4, q0 - vshr.u32 q0, q9, #19 - veor.u32 q4, q4, q1 - vshr.u32 q1, q9, #10 - veor.u32 q4, q4, q0 - vst1.u32 {q10}, [r1]! - veor.u32 q5, q4, q1 - - vshr.u32 q4, q10, #17 - vshl.u32 q0, q10, #32-17 - vadd.u32 q11, q11, q5 - veor.u32 q4, q4, q0 - vshr.u32 q0, q10, #19 - vshl.u32 q1, q10, #32-19 - veor.u32 q4, q4, q0 - vst1.u32 {q11}, [r1]! - veor.u32 q4, q4, q1 - vshr.u32 q1, q10, #10 - vshl.u32 q0, q11, #32-17 - veor.u32 q2, q4, q1 - vshr.u32 q4, q11, #17 - vadd.u32 q12, q12, q2 - vshl.u32 q1, q11, #32-19 - veor.u32 q4, q4, q0 - vshr.u32 q0, q11, #19 - veor.u32 q4, q4, q1 - vshr.u32 q1, q11, #10 - veor.u32 q4, q4, q0 - vst1.u32 {q12}, [r1]! - veor.u32 q5, q4, q1 - - vshr.u32 q4, q12, #17 - vshl.u32 q0, q12, #32-17 - vadd.u32 q13, q13, q5 - veor.u32 q4, q4, q0 - vshr.u32 q0, q12, #19 - vshl.u32 q1, q12, #32-19 - veor.u32 q4, q4, q0 - vst1.u32 {q13}, [r1]! - veor.u32 q4, q4, q1 - vshr.u32 q1, q12, #10 - vshl.u32 q0, q13, #32-17 - veor.u32 q2, q4, q1 - vshr.u32 q4, q13, #17 - vadd.u32 q14, q14, q2 - vshl.u32 q1, q13, #32-19 - veor.u32 q4, q4, q0 - vshr.u32 q0, q13, #19 - veor.u32 q4, q4, q1 - vshr.u32 q1, q13, #10 - veor.u32 q4, q4, q0 - vst1.u32 {q14}, [r1]! - veor.u32 q5, q4, q1 - add r4, r4, #12*16 - - vshr.u32 q4, q14, #17 - vshl.u32 q0, q14, #32-17 - vadd.u32 q15, q15, q5 - veor.u32 q4, q4, q0 - vshr.u32 q0, q14, #19 - vshl.u32 q1, q14, #32-19 - veor.u32 q4, q4, q0 - vst1.u32 {q15}, [r1]! - veor.u32 q4, q4, q1 - vshr.u32 q1, q14, #10 - vld1.u32 {q2}, [r1] - veor.u32 q4, q4, q1 - vshl.u32 q0, q15, #32-17 - vadd.u32 q9, q9, q4 - vst1.u32 {q2}, [sp]! 
- vadd.u32 q9, q9, q2 - vshr.u32 q4, q15, #17 - vshr.u32 q2, q15, #19 - veor.u32 q4, q4, q0 - vst1.u32 {q9}, [r1]! - vshl.u32 q1, q15, #32-19 - veor.u32 q4, q4, q2 - vshr.u32 q0, q15, #10 - veor.u32 q4, q4, q1 - vld1.u32 {q5-q6}, [r4]! - veor.u32 q4, q4, q0 - vld1.u32 {q2}, [r1] - vadd.u32 q10, q10, q4 - vst1.u32 {q2}, [sp]! - vadd.u32 q10, q10, q2 - - sub sp, sp, #8*16 - -sha256d_ms_4way_extend_loop2: - sha256_4way_extend_doubleround_body 16, r4, r1, q11, q12, q9, q10 - sha256_4way_extend_doubleround_body 18, r4, r1, q13, q14, q11, q12 - sha256_4way_extend_doubleround_body 20, r4, r1, q15, q9, q13, q14 - sha256_4way_extend_doubleround_body 22, r4, r1, q10, q11, q15, q9 - sha256_4way_extend_doubleround_body 24, r4, r1, q12, q13, q10, q11 - sha256_4way_extend_doubleround_body 26, r4, r1, q14, q15, q12, q13 - sha256_4way_extend_doubleround_body 28, r4, r1, q9, q10, q14, q15 - sha256_4way_extend_doubleround_body 30, r4, r1, q11, q12, q9, q10 - sha256_4way_extend_doubleround_body 32, r4, r1, q13, q14, q11, q12 - sha256_4way_extend_doubleround_body 34, r4, r1, q15, q9, q13, q14 - sha256_4way_extend_doubleround_body 36, r4, r1, q10, q11, q15, q9 - sha256_4way_extend_doubleround_body 38, r4, r1, q12, q13, q10, q11 - sha256_4way_extend_doubleround_body 40, r4, r1, q14, q15, q12, q13 - sha256_4way_extend_doubleround_body 42, r4, r1, q9, q10, q14, q15 - sha256_4way_extend_doubleround_body 44, r4, r1, q11, q12, q9, q10 - sha256_4way_extend_doubleround_foot 46, r4, r1, q13, q14, q11, q12 - bne sha256d_ms_4way_extend_coda2 - - vldmia r3!, {q4-q7} - vldmia r3, {q0-q3} - vswp q0, q4 - adr r3, sha256d_ms_4way_4k+3*16 - sub r1, r1, #(64-3)*16 - b sha256d_ms_4way_main_loop1 - - .align 4 -sha256d_ms_4way_4k: - sha256_4k - -sha256d_ms_4way_main_loop2: - sha256_4way_main_round 0, r3, r1, q0, q1, q2, q3, q4, q5, q6, q7 - sha256_4way_main_round 1, r3, r1, q3, q0, q1, q2, q7, q4, q5, q6 - sha256_4way_main_round 2, r3, r1, q2, q3, q0, q1, q6, q7, q4, q5 -sha256d_ms_4way_main_loop1: - sha256_4way_main_round 3, r3, r1, q1, q2, q3, q0, q5, q6, q7, q4 - sha256_4way_main_quadround 4, r3, r1 - sha256_4way_main_quadround 8, r3, r1 - sha256_4way_main_quadround 12, r3, r1 - sha256_4way_main_quadround 16, r3, r1 - sha256_4way_main_quadround 20, r3, r1 - sha256_4way_main_quadround 24, r3, r1 - sha256_4way_main_quadround 28, r3, r1 - sha256_4way_main_quadround 32, r3, r1 - sha256_4way_main_quadround 36, r3, r1 - sha256_4way_main_quadround 40, r3, r1 - sha256_4way_main_quadround 44, r3, r1 - sha256_4way_main_quadround 48, r3, r1 - sha256_4way_main_quadround 52, r3, r1 - sha256_4way_main_round 56, r3, r1, q0, q1, q2, q3, q4, q5, q6, q7 + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! 
+ vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [r1]! 
+ vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! 
+ vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 bne sha256d_ms_4way_finish - sha256_4way_main_round 57, r3, r1, q3, q0, q1, q2, q7, q4, q5, q6 - sha256_4way_main_round 58, r3, r1, q2, q3, q0, q1, q6, q7, q4, q5 - sha256_4way_main_round 59, r3, r1, q1, q2, q3, q0, q5, q6, q7, q4 - sha256_4way_main_quadround 60, r3, r1 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! 
+ vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q7, q7, q10 + vshl.u32 q12, q4, #32-5 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vadd.u32 q7, q7, q9 + veor.u32 q9, q0, q1 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + + vshr.u32 q11, q0, #11 + vshl.u32 q12, q0, #32-11 + veor.u32 q8, q0, q11 + vand.u32 q10, q0, q1 + veor.u32 q8, q8, q12 + vshr.u32 q11, q0, #20 + vshl.u32 q12, q0, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q2 + veor.u32 q8, q8, q12 + vadd.u32 q7, q7, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q7, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q7, q7, q3 + vadd.u32 q3, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q6, q6, q10 + vshl.u32 q12, q7, #32-5 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q7, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q6, q6, q8 + veor.u32 q10, q10, q12 + vadd.u32 q6, q6, q9 + veor.u32 q9, q3, q0 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q6, q6, q11 + + vshr.u32 q11, q3, #11 + vshl.u32 q12, q3, #32-11 + veor.u32 q8, q3, q11 + vand.u32 q10, q3, q0 + veor.u32 q8, q8, q12 + vshr.u32 q11, q3, #20 + vshl.u32 q12, q3, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q1 + veor.u32 q8, q8, q12 + vadd.u32 q6, q6, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q6, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q6, q6, q2 + vadd.u32 q2, q9, q12 + vld1.u32 {q8}, [r1]! 
+ vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q5, q5, q10 + vshl.u32 q12, q6, #32-5 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vadd.u32 q5, q5, q9 + veor.u32 q9, q2, q3 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + + vshr.u32 q11, q2, #11 + vshl.u32 q12, q2, #32-11 + veor.u32 q8, q2, q11 + vand.u32 q10, q2, q3 + veor.u32 q8, q8, q12 + vshr.u32 q11, q2, #20 + vshl.u32 q12, q2, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q0 + veor.u32 q8, q8, q12 + vadd.u32 q5, q5, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q5, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q5, q5, q1 + vadd.u32 q1, q9, q12 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [r3]! + vadd.u32 q4, q4, q10 + vshl.u32 q12, q5, #32-5 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vadd.u32 q4, q4, q9 + veor.u32 q9, q1, q2 + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + + vshr.u32 q11, q1, #11 + vshl.u32 q12, q1, #32-11 + veor.u32 q8, q1, q11 + vand.u32 q10, q1, q2 + veor.u32 q8, q8, q12 + vshr.u32 q11, q1, #20 + vshl.u32 q12, q1, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, q3 + veor.u32 q8, q8, q12 + vadd.u32 q4, q4, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, q4, q10 + vadd.u32 q12, q12, q11 + vadd.u32 q4, q4, q0 + vadd.u32 q0, q9, q12 vldmia r2, {q8-q15} vadd.u32 q0, q0, q8 @@ -1526,35 +11813,92 @@ sha256d_ms_4way_extend_coda2: sub r3, r3, #64*16 b sha256d_ms_4way_main_loop2 -.macro sha256_4way_main_round_red i, rk, rw, rd, re, rf, rg, rh - vld1.u32 {q8}, [\rw]! - vand.u32 q9, \rf, \re - vbic.u32 q10, \rg, \re - vshr.u32 q11, \re, #5 + +sha256d_ms_4way_finish: + vld1.u32 {q8}, [r1]! + vand.u32 q9, q4, q7 + vbic.u32 q10, q5, q7 + vshr.u32 q11, q7, #5 vorr.u32 q10, q10, q9 - vshl.u32 q12, \re, #32-5 - vadd.u32 \rh, \rh, q10 - veor.u32 q10, \re, q11 - vshr.u32 q11, \re, #19 + vshl.u32 q12, q7, #32-5 + vadd.u32 q6, q6, q10 + veor.u32 q10, q7, q11 + vshr.u32 q11, q7, #19 veor.u32 q10, q10, q12 - vshl.u32 q12, \re, #32-19 + vshl.u32 q12, q7, #32-19 veor.u32 q10, q10, q11 - vadd.u32 \rh, \rh, q8 + vadd.u32 q6, q6, q8 veor.u32 q10, q10, q12 - vld1.u32 {q9}, [\rk]! - vadd.u32 \rh, \rh, \rd + vld1.u32 {q9}, [r3]! + vadd.u32 q6, q6, q2 vshr.u32 q11, q10, #6 - vadd.u32 \rh, \rh, q9 + vadd.u32 q6, q6, q9 vshl.u32 q13, q10, #32-6 - vadd.u32 \rh, \rh, q11 - vadd.u32 \rh, \rh, q13 -.endm - -sha256d_ms_4way_finish: - sha256_4way_main_round_red 57, r3, r1, q2, q7, q4, q5, q6 - sha256_4way_main_round_red 58, r3, r1, q1, q6, q7, q4, q5 - sha256_4way_main_round_red 59, r3, r1, q0, q5, q6, q7, q4 - sha256_4way_main_round_red 60, r3, r1, q3, q4, q5, q6, q7 + vadd.u32 q6, q6, q11 + vadd.u32 q6, q6, q13 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q7, q6 + vbic.u32 q10, q4, q6 + vshr.u32 q11, q6, #5 + vorr.u32 q10, q10, q9 + vshl.u32 q12, q6, #32-5 + vadd.u32 q5, q5, q10 + veor.u32 q10, q6, q11 + vshr.u32 q11, q6, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q6, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q5, q5, q8 + veor.u32 q10, q10, q12 + vld1.u32 {q9}, [r3]! 
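+/* Reduced finishing rounds: the Maj/Sigma0 half of each round is skipped
+ * here, since only the contribution flowing into the last state word
+ * (accumulated into q7 and stored at r0 + 7*16) is needed for the
+ * result check. */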
+ vadd.u32 q5, q5, q1 + vshr.u32 q11, q10, #6 + vadd.u32 q5, q5, q9 + vshl.u32 q13, q10, #32-6 + vadd.u32 q5, q5, q11 + vadd.u32 q5, q5, q13 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q6, q5 + vbic.u32 q10, q7, q5 + vshr.u32 q11, q5, #5 + vorr.u32 q10, q10, q9 + vshl.u32 q12, q5, #32-5 + vadd.u32 q4, q4, q10 + veor.u32 q10, q5, q11 + vshr.u32 q11, q5, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q5, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q4, q4, q8 + veor.u32 q10, q10, q12 + vld1.u32 {q9}, [r3]! + vadd.u32 q4, q4, q0 + vshr.u32 q11, q10, #6 + vadd.u32 q4, q4, q9 + vshl.u32 q13, q10, #32-6 + vadd.u32 q4, q4, q11 + vadd.u32 q4, q4, q13 + vld1.u32 {q8}, [r1]! + vand.u32 q9, q5, q4 + vbic.u32 q10, q6, q4 + vshr.u32 q11, q4, #5 + vorr.u32 q10, q10, q9 + vshl.u32 q12, q4, #32-5 + vadd.u32 q7, q7, q10 + veor.u32 q10, q4, q11 + vshr.u32 q11, q4, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, q4, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 q7, q7, q8 + veor.u32 q10, q10, q12 + vld1.u32 {q9}, [r3]! + vadd.u32 q7, q7, q3 + vshr.u32 q11, q10, #6 + vadd.u32 q7, q7, q9 + vshl.u32 q13, q10, #32-6 + vadd.u32 q7, q7, q11 + vadd.u32 q7, q7, q13 vadd.u32 q7, q7, q15 add r0, r0, #7*16 diff --git a/sha2-arm.S.orig b/sha2-arm.S.orig new file mode 100644 index 000000000..bd7fdc5cb --- /dev/null +++ b/sha2-arm.S.orig @@ -0,0 +1,1583 @@ +/* + * Copyright 2012 pooler@litecoinpool.org + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. See COPYING for more details. + */ + +#include "cpuminer-config.h" + +#if defined(USE_ASM) && defined(__arm__) && defined(__APCS_32__) + +.macro sha256_k + .align 2 + .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +.endm + +.macro sha256_extend_doubleround_core i, rw, ra, rb, ry, rz + mov r12, \ry, ror #17 + add r11, r11, \ra + eor r12, r12, \ry, ror #19 + mov \ra, lr, ror #7 + eor r12, r12, \ry, lsr #10 + eor \ra, \ra, lr, ror #18 + add r12, r12, r11 + ldr r11, [\rw, #(\i+2)*4] + eor \ra, \ra, lr, lsr #3 + add \ra, \ra, r12 + + mov r12, \rz, ror #17 + str \ra, [\rw, #(\i+16)*4] + add lr, lr, \rb + eor r12, r12, \rz, ror #19 + mov \rb, r11, ror #7 + eor r12, r12, \rz, lsr #10 + eor \rb, \rb, r11, ror #18 + add lr, lr, r12 + eor \rb, \rb, r11, lsr #3 + add \rb, \rb, lr +.endm + +.macro sha256_extend_doubleround_head i, rw, ra, rb, ry, rz + ldr lr, [\rw, #(\i+1)*4] + sha256_extend_doubleround_core \i, \rw, \ra, \rb, \ry, \rz + ldr lr, [\rw, #(\i+3)*4] +.endm + +.macro sha256_extend_doubleround_body i, rw, ra, rb, ry, rz + str \rz, [\rw, #(\i+15)*4] + 
sha256_extend_doubleround_core \i, \rw, \ra, \rb, \ry, \rz + ldr lr, [\rw, #(\i+3)*4] +.endm + +.macro sha256_extend_doubleround_foot i, rw, ra, rb, ry, rz + str \rz, [\rw, #(\i+15)*4] + sha256_extend_doubleround_core \i, \rw, \ra, \rb, \ry, \rz + str \rb, [\rw, #(\i+17)*4] +.endm + +.macro sha256_main_round i, ka, rw, ra, rb, rc, rd, re, rf, rg, rh + ldr r12, [\rw, #(\i)*4] + and r3, \rf, \re + bic lr, \rg, \re + orr lr, lr, r3 + ldr r3, \ka + (\i)*4 + add \rh, \rh, lr + eor lr, \re, \re, ror #5 + add \rh, \rh, r12 + eor lr, lr, \re, ror #19 + add \rh, \rh, r3 + eor r3, \ra, \rb + add \rh, \rh, lr, ror #6 + + and r3, r3, \rc + eor r12, \ra, \ra, ror #11 + and lr, \ra, \rb + eor r12, r12, \ra, ror #20 + eor lr, lr, r3 + add r3, \rh, lr + add \rh, \rh, \rd + add \rd, r3, r12, ror #2 +.endm + +.macro sha256_main_quadround i, ka, rw + sha256_main_round \i+0, \ka, \rw, r4, r5, r6, r7, r8, r9, r10, r11 + sha256_main_round \i+1, \ka, \rw, r7, r4, r5, r6, r11, r8, r9, r10 + sha256_main_round \i+2, \ka, \rw, r6, r7, r4, r5, r10, r11, r8, r9 + sha256_main_round \i+3, \ka, \rw, r5, r6, r7, r4, r9, r10, r11, r8 +.endm + + + .text + .code 32 + .align 2 + .globl sha256_transform + .globl _sha256_transform +#ifdef __ELF__ + .type sha256_transform, %function +#endif +sha256_transform: +_sha256_transform: + stmfd sp!, {r4-r11, lr} + cmp r2, #0 + sub sp, sp, #64*4 + bne sha256_transform_swap + + ldmia r1!, {r4-r11} + stmia sp, {r4-r11} + add r3, sp, #8*4 + ldmia r1, {r4-r11} + stmia r3, {r4-r11} + b sha256_transform_extend + +.macro bswap rd, rn + eor r12, \rn, \rn, ror #16 + bic r12, r12, #0x00ff0000 + mov \rd, \rn, ror #8 + eor \rd, \rd, r12, lsr #8 +.endm + +sha256_transform_swap: + ldmia r1!, {r4-r11} + bswap r4, r4 + bswap r5, r5 + bswap r6, r6 + bswap r7, r7 + bswap r8, r8 + bswap r9, r9 + bswap r10, r10 + bswap r11, r11 + stmia sp, {r4-r11} + add r3, sp, #8*4 + ldmia r1, {r4-r11} + bswap r4, r4 + bswap r5, r5 + bswap r6, r6 + bswap r7, r7 + bswap r8, r8 + bswap r9, r9 + bswap r10, r10 + bswap r11, r11 + stmia r3, {r4-r11} + +sha256_transform_extend: + add r12, sp, #9*4 + ldr r11, [sp, #0*4] + ldmia r12, {r4-r10} + sha256_extend_doubleround_head 0, sp, r4, r5, r9, r10 + sha256_extend_doubleround_body 2, sp, r6, r7, r4, r5 + sha256_extend_doubleround_body 4, sp, r8, r9, r6, r7 + sha256_extend_doubleround_body 6, sp, r10, r4, r8, r9 + sha256_extend_doubleround_body 8, sp, r5, r6, r10, r4 + sha256_extend_doubleround_body 10, sp, r7, r8, r5, r6 + sha256_extend_doubleround_body 12, sp, r9, r10, r7, r8 + sha256_extend_doubleround_body 14, sp, r4, r5, r9, r10 + sha256_extend_doubleround_body 16, sp, r6, r7, r4, r5 + sha256_extend_doubleround_body 18, sp, r8, r9, r6, r7 + sha256_extend_doubleround_body 20, sp, r10, r4, r8, r9 + sha256_extend_doubleround_body 22, sp, r5, r6, r10, r4 + sha256_extend_doubleround_body 24, sp, r7, r8, r5, r6 + sha256_extend_doubleround_body 26, sp, r9, r10, r7, r8 + sha256_extend_doubleround_body 28, sp, r4, r5, r9, r10 + sha256_extend_doubleround_body 30, sp, r6, r7, r4, r5 + sha256_extend_doubleround_body 32, sp, r8, r9, r6, r7 + sha256_extend_doubleround_body 34, sp, r10, r4, r8, r9 + sha256_extend_doubleround_body 36, sp, r5, r6, r10, r4 + sha256_extend_doubleround_body 38, sp, r7, r8, r5, r6 + sha256_extend_doubleround_body 40, sp, r9, r10, r7, r8 + sha256_extend_doubleround_body 42, sp, r4, r5, r9, r10 + sha256_extend_doubleround_body 44, sp, r6, r7, r4, r5 + sha256_extend_doubleround_foot 46, sp, r8, r9, r6, r7 + + ldmia r0, {r4-r11} + sha256_main_quadround 0, 
sha256_transform_k, sp + sha256_main_quadround 4, sha256_transform_k, sp + sha256_main_quadround 8, sha256_transform_k, sp + sha256_main_quadround 12, sha256_transform_k, sp + sha256_main_quadround 16, sha256_transform_k, sp + sha256_main_quadround 20, sha256_transform_k, sp + sha256_main_quadround 24, sha256_transform_k, sp + sha256_main_quadround 28, sha256_transform_k, sp + b sha256_transform_k_over +sha256_transform_k: + sha256_k +sha256_transform_k_over: + sha256_main_quadround 32, sha256_transform_k, sp + sha256_main_quadround 36, sha256_transform_k, sp + sha256_main_quadround 40, sha256_transform_k, sp + sha256_main_quadround 44, sha256_transform_k, sp + sha256_main_quadround 48, sha256_transform_k, sp + sha256_main_quadround 52, sha256_transform_k, sp + sha256_main_quadround 56, sha256_transform_k, sp + sha256_main_quadround 60, sha256_transform_k, sp + + ldmia r0, {r1, r2, r3, r12} + add r4, r4, r1 + add r5, r5, r2 + add r6, r6, r3 + add r7, r7, r12 + stmia r0!, {r4-r7} + ldmia r0, {r1, r2, r3, r12} + add r8, r8, r1 + add r9, r9, r2 + add r10, r10, r3 + add r11, r11, r12 + stmia r0, {r8-r11} + + add sp, sp, #64*4 +#ifdef __thumb__ + ldmfd sp!, {r4-r11, lr} + bx lr +#else + ldmfd sp!, {r4-r11, pc} +#endif + + + .text + .code 32 + .align 2 + .globl sha256d_ms + .globl _sha256d_ms +#ifdef __ELF__ + .type sha256d_ms, %function +#endif +sha256d_ms: +_sha256d_ms: + stmfd sp!, {r4-r11, lr} + sub sp, sp, #64*4 + + cmp r0, r0 + + ldr lr, [r1, #3*4] + ldr r6, [r1, #18*4] + ldr r7, [r1, #19*4] + + mov r12, lr, ror #7 + str r6, [sp, #18*4] + eor r12, r12, lr, ror #18 + str r7, [sp, #19*4] + eor r12, r12, lr, lsr #3 + ldr r8, [r1, #20*4] + add r6, r6, r12 + ldr r10, [r1, #22*4] + add r7, r7, lr + str r6, [r1, #18*4] + + mov r12, r6, ror #17 + str r7, [r1, #19*4] + eor r12, r12, r6, ror #19 + str r8, [sp, #20*4] + eor r12, r12, r6, lsr #10 + ldr r4, [r1, #23*4] + add r8, r8, r12 + ldr r5, [r1, #24*4] + + mov r9, r7, ror #17 + str r8, [r1, #20*4] + eor r9, r9, r7, ror #19 + str r10, [sp, #21*4] + eor r9, r9, r7, lsr #10 + str r4, [sp, #22*4] + + mov r12, r8, ror #17 + str r9, [r1, #21*4] + eor r12, r12, r8, ror #19 + str r5, [sp, #23*4] + eor r12, r12, r8, lsr #10 + mov lr, r9, ror #17 + add r10, r10, r12 + ldr r11, [r1, #30*4] + + eor lr, lr, r9, ror #19 + str r10, [r1, #22*4] + eor lr, lr, r9, lsr #10 + str r11, [sp, #24*4] + add r4, r4, lr + + mov r12, r10, ror #17 + str r4, [r1, #23*4] + eor r12, r12, r10, ror #19 + mov lr, r4, ror #17 + eor r12, r12, r10, lsr #10 + eor lr, lr, r4, ror #19 + add r5, r5, r12 + eor lr, lr, r4, lsr #10 + str r5, [r1, #24*4] + add r6, r6, lr + + mov r12, r5, ror #17 + str r6, [r1, #25*4] + eor r12, r12, r5, ror #19 + mov lr, r6, ror #17 + eor r12, r12, r5, lsr #10 + eor lr, lr, r6, ror #19 + add r7, r7, r12 + eor lr, lr, r6, lsr #10 + str r7, [r1, #26*4] + add r8, r8, lr + + mov r12, r7, ror #17 + str r8, [r1, #27*4] + eor r12, r12, r7, ror #19 + mov lr, r8, ror #17 + eor r12, r12, r7, lsr #10 + eor lr, lr, r8, ror #19 + add r9, r9, r12 + eor lr, lr, r8, lsr #10 + str r9, [r1, #28*4] + add r10, r10, lr + + ldr lr, [r1, #31*4] + mov r12, r9, ror #17 + str r10, [r1, #29*4] + eor r12, r12, r9, ror #19 + str lr, [sp, #25*4] + eor r12, r12, r9, lsr #10 + add r11, r11, r12 + add r5, r5, lr + mov r12, r10, ror #17 + add r4, r4, r11 + + ldr r11, [r1, #16*4] + eor r12, r12, r10, ror #19 + str r4, [r1, #30*4] + eor r12, r12, r10, lsr #10 + add r5, r5, r12 + ldr lr, [r1, #17*4] + +sha256d_ms_extend_loop2: + sha256_extend_doubleround_body 16, r1, r6, r7, r4, r5 + 
sha256_extend_doubleround_body 18, r1, r8, r9, r6, r7 + sha256_extend_doubleround_body 20, r1, r10, r4, r8, r9 + sha256_extend_doubleround_body 22, r1, r5, r6, r10, r4 + sha256_extend_doubleround_body 24, r1, r7, r8, r5, r6 + sha256_extend_doubleround_body 26, r1, r9, r10, r7, r8 + sha256_extend_doubleround_body 28, r1, r4, r5, r9, r10 + sha256_extend_doubleround_body 30, r1, r6, r7, r4, r5 + sha256_extend_doubleround_body 32, r1, r8, r9, r6, r7 + sha256_extend_doubleround_body 34, r1, r10, r4, r8, r9 + sha256_extend_doubleround_body 36, r1, r5, r6, r10, r4 + sha256_extend_doubleround_body 38, r1, r7, r8, r5, r6 + sha256_extend_doubleround_body 40, r1, r9, r10, r7, r8 + sha256_extend_doubleround_body 42, r1, r4, r5, r9, r10 + bne sha256d_ms_extend_coda2 + sha256_extend_doubleround_body 44, r1, r6, r7, r4, r5 + sha256_extend_doubleround_foot 46, r1, r8, r9, r6, r7 + + ldr r4, [r3, #0*4] + ldr r9, [r3, #1*4] + ldr r10, [r3, #2*4] + ldr r11, [r3, #3*4] + ldr r8, [r3, #4*4] + ldr r5, [r3, #5*4] + ldr r6, [r3, #6*4] + ldr r7, [r3, #7*4] + b sha256d_ms_main_loop1 + +sha256d_ms_main_loop2: + sha256_main_round 0, sha256d_ms_k, r1, r4, r5, r6, r7, r8, r9, r10, r11 + sha256_main_round 1, sha256d_ms_k, r1, r7, r4, r5, r6, r11, r8, r9, r10 + sha256_main_round 2, sha256d_ms_k, r1, r6, r7, r4, r5, r10, r11, r8, r9 +sha256d_ms_main_loop1: + sha256_main_round 3, sha256d_ms_k, r1, r5, r6, r7, r4, r9, r10, r11, r8 + sha256_main_quadround 4, sha256d_ms_k, r1 + sha256_main_quadround 8, sha256d_ms_k, r1 + sha256_main_quadround 12, sha256d_ms_k, r1 + sha256_main_quadround 16, sha256d_ms_k, r1 + sha256_main_quadround 20, sha256d_ms_k, r1 + sha256_main_quadround 24, sha256d_ms_k, r1 + sha256_main_quadround 28, sha256d_ms_k, r1 + b sha256d_ms_k_over +sha256d_ms_k: + sha256_k +sha256d_ms_k_over: + sha256_main_quadround 32, sha256d_ms_k, r1 + sha256_main_quadround 36, sha256d_ms_k, r1 + sha256_main_quadround 40, sha256d_ms_k, r1 + sha256_main_quadround 44, sha256d_ms_k, r1 + sha256_main_quadround 48, sha256d_ms_k, r1 + sha256_main_quadround 52, sha256d_ms_k, r1 + sha256_main_round 56, sha256d_ms_k, r1, r4, r5, r6, r7, r8, r9, r10, r11 + bne sha256d_ms_finish + sha256_main_round 57, sha256d_ms_k, r1, r7, r4, r5, r6, r11, r8, r9, r10 + sha256_main_round 58, sha256d_ms_k, r1, r6, r7, r4, r5, r10, r11, r8, r9 + sha256_main_round 59, sha256d_ms_k, r1, r5, r6, r7, r4, r9, r10, r11, r8 + sha256_main_quadround 60, sha256d_ms_k, r1 + + ldmia r2!, {r3, r12, lr} + add r4, r4, r3 + add r5, r5, r12 + add r6, r6, lr + stmia sp, {r4-r6} + ldmia r2, {r3, r4, r5, r6, r12} + add lr, sp, #3*4 + add r7, r7, r3 + add r8, r8, r4 + add r9, r9, r5 + add r10, r10, r6 + add r11, r11, r12 + add r12, sp, #18*4 + stmia lr!, {r7-r11} + + ldmia r12, {r4-r11} + str r4, [r1, #18*4] + str r5, [r1, #19*4] + str r6, [r1, #20*4] + str r7, [r1, #22*4] + str r8, [r1, #23*4] + str r9, [r1, #24*4] + str r10, [r1, #30*4] + str r11, [r1, #31*4] + + mov r3, #0x80000000 + mov r4, #0 + mov r5, #0 + mov r6, #0 + mov r7, #0 + mov r8, #0 + mov r9, #0 + mov r10, #0x00000100 + stmia lr, {r3-r10} + + ldr lr, [sp, #1*4] + movs r1, sp + ldr r4, [sp, #0*4] + + ldr r11, [sp, #2*4] + mov r12, lr, ror #7 + eor r12, r12, lr, ror #18 + add r5, lr, #0x00a00000 + eor r12, r12, lr, lsr #3 + mov lr, r11, ror #7 + add r4, r4, r12 + eor lr, lr, r11, ror #18 + str r4, [sp, #16*4] + eor lr, lr, r11, lsr #3 + mov r12, r4, ror #17 + add r5, r5, lr + ldr lr, [sp, #3*4] + + str r5, [sp, #17*4] + eor r12, r12, r4, ror #19 + mov r6, lr, ror #7 + eor r12, r12, r4, lsr #10 + eor r6, r6, lr, 
ror #18 + add r11, r11, r12 + eor r6, r6, lr, lsr #3 + mov r12, r5, ror #17 + add r6, r6, r11 + ldr r11, [sp, #4*4] + + str r6, [sp, #18*4] + eor r12, r12, r5, ror #19 + mov r7, r11, ror #7 + eor r12, r12, r5, lsr #10 + eor r7, r7, r11, ror #18 + add lr, lr, r12 + eor r7, r7, r11, lsr #3 + mov r12, r6, ror #17 + add r7, r7, lr + ldr lr, [sp, #5*4] + + str r7, [sp, #19*4] + eor r12, r12, r6, ror #19 + mov r8, lr, ror #7 + eor r12, r12, r6, lsr #10 + eor r8, r8, lr, ror #18 + add r11, r11, r12 + eor r8, r8, lr, lsr #3 + mov r12, r7, ror #17 + add r8, r8, r11 + ldr r11, [sp, #6*4] + + str r8, [sp, #20*4] + eor r12, r12, r7, ror #19 + mov r9, r11, ror #7 + eor r12, r12, r7, lsr #10 + eor r9, r9, r11, ror #18 + add lr, lr, r12 + eor r9, r9, r11, lsr #3 + mov r12, r8, ror #17 + add r9, r9, lr + ldr lr, [sp, #7*4] + + str r9, [sp, #21*4] + eor r12, r12, r8, ror #19 + mov r10, lr, ror #7 + eor r12, r12, r8, lsr #10 + eor r10, r10, lr, ror #18 + add r11, r11, r12 + eor r10, r10, lr, lsr #3 + mov r12, r9, ror #17 + add r11, r11, #0x00000100 + add lr, lr, r4 + add r10, r10, r11 + + eor r12, r12, r9, ror #19 + str r10, [sp, #22*4] + add lr, lr, #0x11000000 + eor r12, r12, r9, lsr #10 + add lr, lr, r12 + mov r12, r10, ror #17 + add r4, lr, #0x00002000 + eor r12, r12, r10, ror #19 + str r4, [sp, #23*4] + add r5, r5, #0x80000000 + eor r12, r12, r10, lsr #10 + add r5, r5, r12 + + mov r12, r4, ror #17 + str r5, [sp, #24*4] + eor r12, r12, r4, ror #19 + mov r11, r5, ror #17 + eor r12, r12, r4, lsr #10 + eor r11, r11, r5, ror #19 + add r6, r6, r12 + eor r11, r11, r5, lsr #10 + str r6, [sp, #25*4] + add r7, r7, r11 + + mov r12, r6, ror #17 + str r7, [sp, #26*4] + eor r12, r12, r6, ror #19 + mov r11, r7, ror #17 + eor r12, r12, r6, lsr #10 + eor r11, r11, r7, ror #19 + add r8, r8, r12 + eor r11, r11, r7, lsr #10 + str r8, [sp, #27*4] + add r9, r9, r11 + + mov lr, r8, ror #17 + mov r12, r9, ror #17 + str r9, [sp, #28*4] + add r4, r4, #0x00400000 + eor lr, lr, r8, ror #19 + eor r12, r12, r9, ror #19 + eor lr, lr, r8, lsr #10 + eor r12, r12, r9, lsr #10 + add r4, r4, #0x00000022 + add r10, r10, lr + add r4, r4, r12 + ldr r11, [sp, #16*4] + + add r5, r5, #0x00000100 + str r4, [sp, #30*4] + mov lr, r11, ror #7 + str r10, [sp, #29*4] + mov r12, r10, ror #17 + eor lr, lr, r11, ror #18 + eor r12, r12, r10, ror #19 + eor lr, lr, r11, lsr #3 + eor r12, r12, r10, lsr #10 + add r5, r5, lr + ldr lr, [r1, #17*4] + add r5, r5, r12 + + b sha256d_ms_extend_loop2 + +sha256d_ms_extend_coda2: + str r5, [r1, #(44+15)*4] + mov r12, r4, ror #17 + add r11, r11, r6 + mov r6, lr, ror #7 + eor r12, r12, r4, ror #19 + eor r6, r6, lr, ror #18 + eor r12, r12, r4, lsr #10 + eor r6, r6, lr, lsr #3 + add r12, r12, r11 + add r6, r6, r12 + str r6, [r1, #(44+16)*4] + + adr r2, sha256d_ms_h + ldmia r2, {r4-r11} + b sha256d_ms_main_loop2 + +sha256d_ms_h: + .long 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a + .long 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 + +.macro sha256_main_round_red i, ka, rw, rd, re, rf, rg, rh + ldr r12, [\rw, #(\i)*4] + and r3, \rf, \re + bic lr, \rg, \re + add \rh, \rh, \rd + orr lr, lr, r3 + ldr r3, \ka + (\i)*4 + add \rh, \rh, lr + eor lr, \re, \re, ror #5 + add \rh, \rh, r12 + eor lr, lr, \re, ror #19 + add \rh, \rh, r3 + add \rh, \rh, lr, ror #6 +.endm + +sha256d_ms_finish: + sha256_main_round_red 57, sha256d_ms_k, r1, r6, r11, r8, r9, r10 + sha256_main_round_red 58, sha256d_ms_k, r1, r5, r10, r11, r8, r9 + sha256_main_round_red 59, sha256d_ms_k, r1, r4, r9, r10, r11, r8 + ldr r5, [r2, #7*4] + 
sha256_main_round_red 60, sha256d_ms_k, r1, r7, r8, r9, r10, r11 + + add r11, r11, r5 + str r11, [r0, #7*4] + + add sp, sp, #64*4 +#ifdef __thumb__ + ldmfd sp!, {r4-r11, lr} + bx lr +#else + ldmfd sp!, {r4-r11, pc} +#endif + + +#ifdef __ARM_NEON__ + + .text + .code 32 + .align 2 + .globl sha256_init_4way + .globl _sha256_init_4way +#ifdef __ELF__ + .type sha256_init_4way, %function +#endif +sha256_init_4way: +_sha256_init_4way: + adr r12, sha256_4h + vldmia r12, {q8-q15} + vstmia r0, {q8-q15} + bx lr + .align 4 +sha256_4h: + .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 + .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 + .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372 + .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a + .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f + .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c + .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab + .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 + +.macro sha256_4k + .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98 + .long 0x71374491, 0x71374491, 0x71374491, 0x71374491 + .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf + .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5 + .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b + .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1 + .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4 + .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5 + .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98 + .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01 + .long 0x243185be, 0x243185be, 0x243185be, 0x243185be + .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3 + .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74 + .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe + .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7 + .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174 + .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1 + .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786 + .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6 + .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc + .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f + .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa + .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc + .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da + .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152 + .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d + .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8 + .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7 + .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3 + .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147 + .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351 + .long 0x14292967, 0x14292967, 0x14292967, 0x14292967 + .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85 + .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138 + .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc + .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13 + .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354 + .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb + .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e + .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85 + .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1 + .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b + .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70 + .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3 + .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819 + .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624 + .long 0xf40e3585, 0xf40e3585, 
0xf40e3585, 0xf40e3585 + .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070 + .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116 + .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08 + .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c + .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5 + .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3 + .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a + .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f + .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3 + .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee + .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f + .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814 + .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208 + .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa + .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb + .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7 + .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2 +.endm + +.macro sha256_4way_extend_doubleround_core i, rr, rw, ra, rb, ry, rz + vadd.u32 q5, q5, \ra + veor.u32 q4, q4, q0 + vshr.u32 q0, \ry, #19 + vshl.u32 q1, \ry, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 \ra, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 \ra, \ra, q0 + vshr.u32 q1, \ry, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 \ra, \ra, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 \ra, \ra, q1 + vadd.u32 q4, q4, q5 + veor.u32 \ra, \ra, q0 + vld1.u32 {q5}, [\rr]! + vadd.u32 \ra, \ra, q4 + + vshr.u32 q4, \rz, #17 + vshl.u32 q0, \rz, #32-17 + vadd.u32 q6, q6, \rb + vst1.u32 {\ra}, [\rw]! + veor.u32 q4, q4, q0 + vshr.u32 q0, \rz, #19 + vshl.u32 q1, \rz, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 \rb, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, \rz, #10 + veor.u32 \rb, \rb, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 \rb, \rb, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 \rb, \rb, q1 + vadd.u32 q1, q6, q4 + veor.u32 \rb, \rb, q0 +.endm + +.macro sha256_4way_extend_doubleround_head i, rr, rw, ra, rb, ry, rz + vld1.u32 {q6}, [\rr]! + vshr.u32 q4, \ry, #17 + vshl.u32 q0, \ry, #32-17 + sha256_4way_extend_doubleround_core \i, \rr, \rw, \ra, \rb, \ry, \rz + vld1.u32 {q6}, [\rr]! + vadd.u32 \rb, \rb, q1 +.endm + +.macro sha256_4way_extend_doubleround_body i, rr, rw, ra, rb, ry, rz + vshr.u32 q4, \ry, #17 + vshl.u32 q0, \ry, #32-17 + vst1.u32 {\rz}, [\rw]! + sha256_4way_extend_doubleround_core \i, \rr, \rw, \ra, \rb, \ry, \rz + vld1.u32 {q6}, [\rr]! + vadd.u32 \rb, \rb, q1 +.endm + +.macro sha256_4way_extend_doubleround_foot i, rr, rw, ra, rb, ry, rz + vshr.u32 q4, \ry, #17 + vshl.u32 q0, \ry, #32-17 + vst1.u32 {\rz}, [\rw]! + sha256_4way_extend_doubleround_core \i, \rr, \rw, \ra, \rb, \ry, \rz + vadd.u32 \rb, \rb, q1 + vst1.u32 {\rb}, [\rw]! +.endm + +.macro sha256_4way_main_round i, rk, rw, ra, rb, rc, rd, re, rf, rg, rh + vld1.u32 {q8}, [\rw]! + vand.u32 q9, \rf, \re + vbic.u32 q10, \rg, \re + vshr.u32 q11, \re, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [\rk]! 
+ vadd.u32 \rh, \rh, q10 + vshl.u32 q12, \re, #32-5 + veor.u32 q10, \re, q11 + vshr.u32 q11, \re, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, \re, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 \rh, \rh, q8 + veor.u32 q10, q10, q12 + vadd.u32 \rh, \rh, q9 + veor.u32 q9, \ra, \rb + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 \rh, \rh, q11 + + vshr.u32 q11, \ra, #11 + vshl.u32 q12, \ra, #32-11 + veor.u32 q8, \ra, q11 + vand.u32 q10, \ra, \rb + veor.u32 q8, q8, q12 + vshr.u32 q11, \ra, #20 + vshl.u32 q12, \ra, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, \rc + veor.u32 q8, q8, q12 + vadd.u32 \rh, \rh, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, \rh, q10 + vadd.u32 q12, q12, q11 + vadd.u32 \rh, \rh, \rd + vadd.u32 \rd, q9, q12 +.endm + +.macro sha256_4way_main_quadround i, rk, rw + sha256_4way_main_round \i+0, \rk, \rw, q0, q1, q2, q3, q4, q5, q6, q7 + sha256_4way_main_round \i+1, \rk, \rw, q3, q0, q1, q2, q7, q4, q5, q6 + sha256_4way_main_round \i+2, \rk, \rw, q2, q3, q0, q1, q6, q7, q4, q5 + sha256_4way_main_round \i+3, \rk, \rw, q1, q2, q3, q0, q5, q6, q7, q4 +.endm + + + .text + .code 32 + .align 2 + .globl sha256_transform_4way + .globl _sha256_transform_4way +#ifdef __ELF__ + .type sha256_transform_4way, %function +#endif +sha256_transform_4way: +_sha256_transform_4way: + stmfd sp!, {r4, lr} + vpush {q4-q7} + mov r12, sp + sub sp, sp, #64*16 + bic sp, sp, #63 + cmp r2, #0 + bne sha256_transform_4way_swap + + vldmia r1!, {q0-q7} + vstmia sp, {q0-q7} + add r3, sp, #8*16 + vldmia r1, {q8-q15} + vstmia r3, {q8-q15} + b sha256_transform_4way_extend + +sha256_transform_4way_swap: + vldmia r1!, {q0-q7} + vrev32.8 q0, q0 + vrev32.8 q1, q1 + vrev32.8 q2, q2 + vrev32.8 q3, q3 + vldmia r1, {q8-q15} + vrev32.8 q4, q4 + vrev32.8 q5, q5 + vrev32.8 q6, q6 + vrev32.8 q7, q7 + vstmia sp, {q0-q7} + vrev32.8 q8, q8 + vrev32.8 q9, q9 + vrev32.8 q10, q10 + vrev32.8 q11, q11 + vrev32.8 q12, q12 + vrev32.8 q13, q13 + vrev32.8 q14, q14 + vrev32.8 q15, q15 + add r3, sp, #8*16 + vstmia r3, {q8-q15} + +sha256_transform_4way_extend: + add r1, sp, #1*16 + add r2, sp, #16*16 + vmov.u32 q5, q0 + sha256_4way_extend_doubleround_head 0, r1, r2, q9, q10, q14, q15 + sha256_4way_extend_doubleround_body 2, r1, r2, q11, q12, q9, q10 + sha256_4way_extend_doubleround_body 4, r1, r2, q13, q14, q11, q12 + sha256_4way_extend_doubleround_body 6, r1, r2, q15, q9, q13, q14 + sha256_4way_extend_doubleround_body 8, r1, r2, q10, q11, q15, q9 + sha256_4way_extend_doubleround_body 10, r1, r2, q12, q13, q10, q11 + sha256_4way_extend_doubleround_body 12, r1, r2, q14, q15, q12, q13 + sha256_4way_extend_doubleround_body 14, r1, r2, q9, q10, q14, q15 + sha256_4way_extend_doubleround_body 16, r1, r2, q11, q12, q9, q10 + sha256_4way_extend_doubleround_body 18, r1, r2, q13, q14, q11, q12 + sha256_4way_extend_doubleround_body 20, r1, r2, q15, q9, q13, q14 + sha256_4way_extend_doubleround_body 22, r1, r2, q10, q11, q15, q9 + sha256_4way_extend_doubleround_body 24, r1, r2, q12, q13, q10, q11 + sha256_4way_extend_doubleround_body 26, r1, r2, q14, q15, q12, q13 + sha256_4way_extend_doubleround_body 28, r1, r2, q9, q10, q14, q15 + sha256_4way_extend_doubleround_body 30, r1, r2, q11, q12, q9, q10 + sha256_4way_extend_doubleround_body 32, r1, r2, q13, q14, q11, q12 + sha256_4way_extend_doubleround_body 34, r1, r2, q15, q9, q13, q14 + sha256_4way_extend_doubleround_body 36, r1, r2, q10, q11, q15, q9 + sha256_4way_extend_doubleround_body 38, r1, r2, q12, q13, q10, q11 + 
sha256_4way_extend_doubleround_body 40, r1, r2, q14, q15, q12, q13 + sha256_4way_extend_doubleround_body 42, r1, r2, q9, q10, q14, q15 + sha256_4way_extend_doubleround_body 44, r1, r2, q11, q12, q9, q10 + sha256_4way_extend_doubleround_foot 46, r1, r2, q13, q14, q11, q12 + + vldmia r0, {q0-q7} + adr r4, sha256_transform_4way_4k + b sha256_transform_4way_4k_over + .align 4 +sha256_transform_4way_4k: + sha256_4k +sha256_transform_4way_4k_over: + sha256_4way_main_quadround 0, r4, sp + sha256_4way_main_quadround 4, r4, sp + sha256_4way_main_quadround 8, r4, sp + sha256_4way_main_quadround 12, r4, sp + sha256_4way_main_quadround 16, r4, sp + sha256_4way_main_quadround 20, r4, sp + sha256_4way_main_quadround 24, r4, sp + sha256_4way_main_quadround 28, r4, sp + sha256_4way_main_quadround 32, r4, sp + sha256_4way_main_quadround 36, r4, sp + sha256_4way_main_quadround 40, r4, sp + sha256_4way_main_quadround 44, r4, sp + sha256_4way_main_quadround 48, r4, sp + sha256_4way_main_quadround 52, r4, sp + sha256_4way_main_quadround 56, r4, sp + sha256_4way_main_quadround 60, r4, sp + + vldmia r0, {q8-q15} + vadd.u32 q0, q0, q8 + vadd.u32 q1, q1, q9 + vadd.u32 q2, q2, q10 + vadd.u32 q3, q3, q11 + vadd.u32 q4, q4, q12 + vadd.u32 q5, q5, q13 + vadd.u32 q6, q6, q14 + vadd.u32 q7, q7, q15 + vstmia r0, {q0-q7} + + mov sp, r12 + vpop {q4-q7} + ldmfd sp!, {r4, pc} + + + .text + .code 32 + .align 2 + .globl sha256d_ms_4way + .globl _sha256d_ms_4way +#ifdef __ELF__ + .type sha256d_ms_4way, %function +#endif +sha256d_ms_4way: +_sha256d_ms_4way: + stmfd sp!, {r4, lr} + vpush {q4-q7} + mov r12, sp + sub sp, sp, #64*16 + bic sp, sp, #63 + + add r4, r1, #3*16 + vld1.u32 {q6}, [r4]! + add r1, r1, #18*16 + vldmia r1, {q11-q13} + cmp r0, r0 + + vshr.u32 q10, q6, #7 + vshl.u32 q0, q6, #32-7 + vshr.u32 q1, q6, #18 + veor.u32 q10, q10, q0 + vshl.u32 q0, q6, #32-18 + veor.u32 q10, q10, q1 + vshr.u32 q1, q6, #3 + veor.u32 q10, q10, q0 + vstmia sp!, {q11-q13} + veor.u32 q4, q10, q1 + vadd.u32 q12, q12, q6 + vadd.u32 q11, q11, q4 + + vshr.u32 q14, q12, #17 + vshr.u32 q4, q11, #17 + vshl.u32 q0, q11, #32-17 + vst1.u32 {q11}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q11, #19 + vshl.u32 q1, q11, #32-19 + veor.u32 q4, q4, q0 + vst1.u32 {q12}, [r1]! + veor.u32 q4, q4, q1 + vshr.u32 q1, q11, #10 + vshl.u32 q0, q12, #32-17 + veor.u32 q4, q4, q1 + veor.u32 q14, q14, q0 + vadd.u32 q13, q13, q4 + vshr.u32 q0, q12, #19 + vshl.u32 q1, q12, #32-19 + veor.u32 q14, q14, q0 + vst1.u32 {q13}, [r1]! + veor.u32 q14, q14, q1 + vshr.u32 q1, q12, #10 + + vshr.u32 q4, q13, #17 + vshl.u32 q0, q13, #32-17 + veor.u32 q14, q14, q1 + veor.u32 q4, q4, q0 + vshr.u32 q0, q13, #19 + vshl.u32 q1, q13, #32-19 + veor.u32 q4, q4, q0 + vst1.u32 {q14}, [r1]! + veor.u32 q4, q4, q1 + vshr.u32 q1, q13, #10 + vld1.u32 {q15}, [r1] + veor.u32 q4, q4, q1 + vst1.u32 {q15}, [sp]! + vadd.u32 q15, q15, q4 + vshr.u32 q4, q14, #17 + vshl.u32 q0, q14, #32-17 + vshl.u32 q1, q14, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q0, q14, #19 + vst1.u32 {q15}, [r1]! + veor.u32 q4, q4, q0 + vld1.u32 {q9}, [r1] + veor.u32 q4, q4, q1 + vshr.u32 q1, q14, #10 + vst1.u32 {q9}, [sp]! + veor.u32 q5, q4, q1 + + vshr.u32 q4, q15, #17 + vadd.u32 q9, q9, q5 + vshl.u32 q0, q15, #32-17 + vshl.u32 q1, q15, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q0, q15, #19 + vst1.u32 {q9}, [r1]! + veor.u32 q4, q4, q0 + vld1.u32 {q10}, [r1] + veor.u32 q4, q4, q1 + vshr.u32 q1, q15, #10 + vst1.u32 {q10}, [sp]! 
+ veor.u32 q4, q4, q1 + vshl.u32 q0, q9, #32-17 + vadd.u32 q10, q10, q4 + vshr.u32 q4, q9, #17 + vshl.u32 q1, q9, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q0, q9, #19 + veor.u32 q4, q4, q1 + vshr.u32 q1, q9, #10 + veor.u32 q4, q4, q0 + vst1.u32 {q10}, [r1]! + veor.u32 q5, q4, q1 + + vshr.u32 q4, q10, #17 + vshl.u32 q0, q10, #32-17 + vadd.u32 q11, q11, q5 + veor.u32 q4, q4, q0 + vshr.u32 q0, q10, #19 + vshl.u32 q1, q10, #32-19 + veor.u32 q4, q4, q0 + vst1.u32 {q11}, [r1]! + veor.u32 q4, q4, q1 + vshr.u32 q1, q10, #10 + vshl.u32 q0, q11, #32-17 + veor.u32 q2, q4, q1 + vshr.u32 q4, q11, #17 + vadd.u32 q12, q12, q2 + vshl.u32 q1, q11, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q0, q11, #19 + veor.u32 q4, q4, q1 + vshr.u32 q1, q11, #10 + veor.u32 q4, q4, q0 + vst1.u32 {q12}, [r1]! + veor.u32 q5, q4, q1 + + vshr.u32 q4, q12, #17 + vshl.u32 q0, q12, #32-17 + vadd.u32 q13, q13, q5 + veor.u32 q4, q4, q0 + vshr.u32 q0, q12, #19 + vshl.u32 q1, q12, #32-19 + veor.u32 q4, q4, q0 + vst1.u32 {q13}, [r1]! + veor.u32 q4, q4, q1 + vshr.u32 q1, q12, #10 + vshl.u32 q0, q13, #32-17 + veor.u32 q2, q4, q1 + vshr.u32 q4, q13, #17 + vadd.u32 q14, q14, q2 + vshl.u32 q1, q13, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q0, q13, #19 + veor.u32 q4, q4, q1 + vshr.u32 q1, q13, #10 + veor.u32 q4, q4, q0 + vst1.u32 {q14}, [r1]! + veor.u32 q5, q4, q1 + add r4, r4, #12*16 + + vshr.u32 q4, q14, #17 + vshl.u32 q0, q14, #32-17 + vadd.u32 q15, q15, q5 + veor.u32 q4, q4, q0 + vshr.u32 q0, q14, #19 + vshl.u32 q1, q14, #32-19 + veor.u32 q4, q4, q0 + vst1.u32 {q15}, [r1]! + veor.u32 q4, q4, q1 + vshr.u32 q1, q14, #10 + vld1.u32 {q2}, [r1] + veor.u32 q4, q4, q1 + vshl.u32 q0, q15, #32-17 + vadd.u32 q9, q9, q4 + vst1.u32 {q2}, [sp]! + vadd.u32 q9, q9, q2 + vshr.u32 q4, q15, #17 + vshr.u32 q2, q15, #19 + veor.u32 q4, q4, q0 + vst1.u32 {q9}, [r1]! + vshl.u32 q1, q15, #32-19 + veor.u32 q4, q4, q2 + vshr.u32 q0, q15, #10 + veor.u32 q4, q4, q1 + vld1.u32 {q5-q6}, [r4]! + veor.u32 q4, q4, q0 + vld1.u32 {q2}, [r1] + vadd.u32 q10, q10, q4 + vst1.u32 {q2}, [sp]! 
+ vadd.u32 q10, q10, q2 + + sub sp, sp, #8*16 + +sha256d_ms_4way_extend_loop2: + sha256_4way_extend_doubleround_body 16, r4, r1, q11, q12, q9, q10 + sha256_4way_extend_doubleround_body 18, r4, r1, q13, q14, q11, q12 + sha256_4way_extend_doubleround_body 20, r4, r1, q15, q9, q13, q14 + sha256_4way_extend_doubleround_body 22, r4, r1, q10, q11, q15, q9 + sha256_4way_extend_doubleround_body 24, r4, r1, q12, q13, q10, q11 + sha256_4way_extend_doubleround_body 26, r4, r1, q14, q15, q12, q13 + sha256_4way_extend_doubleround_body 28, r4, r1, q9, q10, q14, q15 + sha256_4way_extend_doubleround_body 30, r4, r1, q11, q12, q9, q10 + sha256_4way_extend_doubleround_body 32, r4, r1, q13, q14, q11, q12 + sha256_4way_extend_doubleround_body 34, r4, r1, q15, q9, q13, q14 + sha256_4way_extend_doubleround_body 36, r4, r1, q10, q11, q15, q9 + sha256_4way_extend_doubleround_body 38, r4, r1, q12, q13, q10, q11 + sha256_4way_extend_doubleround_body 40, r4, r1, q14, q15, q12, q13 + sha256_4way_extend_doubleround_body 42, r4, r1, q9, q10, q14, q15 + sha256_4way_extend_doubleround_body 44, r4, r1, q11, q12, q9, q10 + sha256_4way_extend_doubleround_foot 46, r4, r1, q13, q14, q11, q12 + bne sha256d_ms_4way_extend_coda2 + + vldmia r3!, {q4-q7} + vldmia r3, {q0-q3} + vswp q0, q4 + adr r3, sha256d_ms_4way_4k+3*16 + sub r1, r1, #(64-3)*16 + b sha256d_ms_4way_main_loop1 + + .align 4 +sha256d_ms_4way_4k: + sha256_4k + +sha256d_ms_4way_main_loop2: + sha256_4way_main_round 0, r3, r1, q0, q1, q2, q3, q4, q5, q6, q7 + sha256_4way_main_round 1, r3, r1, q3, q0, q1, q2, q7, q4, q5, q6 + sha256_4way_main_round 2, r3, r1, q2, q3, q0, q1, q6, q7, q4, q5 +sha256d_ms_4way_main_loop1: + sha256_4way_main_round 3, r3, r1, q1, q2, q3, q0, q5, q6, q7, q4 + sha256_4way_main_quadround 4, r3, r1 + sha256_4way_main_quadround 8, r3, r1 + sha256_4way_main_quadround 12, r3, r1 + sha256_4way_main_quadround 16, r3, r1 + sha256_4way_main_quadround 20, r3, r1 + sha256_4way_main_quadround 24, r3, r1 + sha256_4way_main_quadround 28, r3, r1 + sha256_4way_main_quadround 32, r3, r1 + sha256_4way_main_quadround 36, r3, r1 + sha256_4way_main_quadround 40, r3, r1 + sha256_4way_main_quadround 44, r3, r1 + sha256_4way_main_quadround 48, r3, r1 + sha256_4way_main_quadround 52, r3, r1 + sha256_4way_main_round 56, r3, r1, q0, q1, q2, q3, q4, q5, q6, q7 + bne sha256d_ms_4way_finish + sha256_4way_main_round 57, r3, r1, q3, q0, q1, q2, q7, q4, q5, q6 + sha256_4way_main_round 58, r3, r1, q2, q3, q0, q1, q6, q7, q4, q5 + sha256_4way_main_round 59, r3, r1, q1, q2, q3, q0, q5, q6, q7, q4 + sha256_4way_main_quadround 60, r3, r1 + + vldmia r2, {q8-q15} + vadd.u32 q0, q0, q8 + vadd.u32 q1, q1, q9 + vadd.u32 q2, q2, q10 + vadd.u32 q3, q3, q11 + vadd.u32 q4, q4, q12 + vadd.u32 q5, q5, q13 + vadd.u32 q6, q6, q14 + vadd.u32 q7, q7, q15 + + vldmia sp, {q8-q15} + sub r1, r1, #(64-18)*16 + vstmia r1, {q8-q10} + add r1, r1, #4*16 + vstmia r1, {q11-q13} + add r1, r1, #8*16 + vstmia r1, {q14-q15} + + vstmia sp, {q0-q7} + vmov.u32 q8, #0x80000000 + vmov.u32 q9, #0 + vmov.u32 q10, #0 + vmov.u32 q11, #0 + vmov.u32 q12, #0 + vmov.u32 q13, #0 + vmov.u32 q14, #0 + vmov.u32 q15, #0x00000100 + add r1, sp, #8*16 + vstmia r1!, {q8-q15} + adds r4, sp, #2*16 + + vshr.u32 q9, q1, #7 + vshl.u32 q2, q1, #32-7 + vshr.u32 q4, q1, #18 + veor.u32 q9, q9, q2 + vshl.u32 q3, q1, #32-18 + veor.u32 q9, q9, q4 + vshr.u32 q2, q1, #3 + veor.u32 q9, q9, q3 + vld1.u32 {q5}, [r4]! 
+ veor.u32 q9, q9, q2 + vmov.u32 q7, #0x00a00000 + vadd.u32 q9, q9, q0 + vshr.u32 q10, q5, #7 + vshl.u32 q0, q5, #32-7 + vshl.u32 q3, q5, #32-18 + veor.u32 q10, q10, q0 + vshr.u32 q0, q5, #18 + veor.u32 q10, q10, q3 + vst1.u32 {q9}, [r1]! + vadd.u32 q3, q1, q7 + veor.u32 q10, q10, q0 + vshr.u32 q0, q5, #3 + vld1.u32 {q6}, [r4]! + veor.u32 q10, q10, q0 + + vshr.u32 q4, q9, #17 + vshl.u32 q0, q9, #32-17 + vadd.u32 q10, q10, q3 + veor.u32 q4, q4, q0 + vshr.u32 q0, q9, #19 + vshl.u32 q1, q9, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q11, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q11, q11, q0 + vshr.u32 q1, q9, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q11, q11, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q11, q11, q1 + vadd.u32 q4, q4, q5 + veor.u32 q11, q11, q0 + vld1.u32 {q5}, [r4]! + vadd.u32 q11, q11, q4 + vshr.u32 q4, q10, #17 + vshl.u32 q0, q10, #32-17 + vst1.u32 {q10}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q10, #19 + vshl.u32 q1, q10, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q12, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q10, #10 + veor.u32 q12, q12, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q12, q12, q0 + vshl.u32 q1, q5, #32-18 + vst1.u32 {q11}, [r1]! + veor.u32 q12, q12, q1 + vshr.u32 q0, q5, #3 + vadd.u32 q1, q6, q4 + veor.u32 q12, q12, q0 + + vshr.u32 q4, q11, #17 + vshl.u32 q0, q11, #32-17 + vadd.u32 q12, q12, q1 + vld1.u32 {q6}, [r4]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q11, #19 + vshl.u32 q1, q11, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q13, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q13, q13, q0 + vshr.u32 q1, q11, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q13, q13, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q13, q13, q1 + vadd.u32 q4, q4, q5 + veor.u32 q13, q13, q0 + vld1.u32 {q5}, [r4]! + vadd.u32 q13, q13, q4 + vshr.u32 q4, q12, #17 + vshl.u32 q0, q12, #32-17 + vst1.u32 {q12}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q12, #19 + vshl.u32 q1, q12, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q14, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q12, #10 + veor.u32 q14, q14, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q14, q14, q0 + vshl.u32 q1, q5, #32-18 + vst1.u32 {q13}, [r1]! + veor.u32 q14, q14, q1 + vshr.u32 q0, q5, #3 + vadd.u32 q1, q6, q4 + veor.u32 q14, q14, q0 + + vshr.u32 q4, q13, #17 + vshl.u32 q0, q13, #32-17 + vadd.u32 q14, q14, q1 + vld1.u32 {q6}, [r4]! + vadd.u32 q5, q5, q15 + veor.u32 q4, q4, q0 + vshr.u32 q0, q13, #19 + vshl.u32 q1, q13, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q15, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q15, q15, q0 + vshr.u32 q1, q13, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q15, q15, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q15, q15, q1 + vadd.u32 q4, q4, q5 + veor.u32 q15, q15, q0 + vmov.u32 q5, #0x80000000 + vadd.u32 q15, q15, q4 + vshr.u32 q4, q14, #17 + vshl.u32 q0, q14, #32-17 + vadd.u32 q6, q6, q9 + vst1.u32 {q14}, [r1]! + vmov.u32 q7, #0x11000000 + veor.u32 q4, q4, q0 + vshr.u32 q0, q14, #19 + vshl.u32 q1, q14, #32-19 + vadd.u32 q6, q6, q7 + vmov.u32 q2, #0x00002000 + veor.u32 q4, q4, q0 + vst1.u32 {q15}, [r1]! 
+ veor.u32 q4, q4, q1 + vshr.u32 q1, q14, #10 + vadd.u32 q6, q6, q2 + veor.u32 q1, q4, q1 + add r4, r4, #8*16 + + vshr.u32 q4, q15, #17 + vshl.u32 q0, q15, #32-17 + vadd.u32 q9, q6, q1 + veor.u32 q4, q4, q0 + vshr.u32 q0, q15, #19 + vshl.u32 q1, q15, #32-19 + veor.u32 q4, q4, q0 + vst1.u32 {q9}, [r1]! + vadd.u32 q5, q5, q10 + veor.u32 q4, q4, q1 + vshr.u32 q1, q15, #10 + vshl.u32 q0, q9, #32-17 + veor.u32 q10, q4, q1 + vshr.u32 q4, q9, #17 + vadd.u32 q10, q10, q5 + veor.u32 q4, q4, q0 + vshr.u32 q0, q9, #19 + vshl.u32 q1, q9, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q0, q9, #10 + veor.u32 q4, q4, q1 + vst1.u32 {q10}, [r1]! + veor.u32 q1, q4, q0 + + vshr.u32 q4, q10, #17 + vshl.u32 q0, q10, #32-17 + vadd.u32 q11, q11, q1 + veor.u32 q4, q4, q0 + vshr.u32 q0, q10, #19 + vshl.u32 q1, q10, #32-19 + veor.u32 q4, q4, q0 + vst1.u32 {q11}, [r1]! + veor.u32 q4, q4, q1 + vshr.u32 q1, q10, #10 + vshl.u32 q0, q11, #32-17 + veor.u32 q1, q4, q1 + vshr.u32 q4, q11, #17 + vadd.u32 q12, q12, q1 + veor.u32 q4, q4, q0 + vshr.u32 q0, q11, #19 + vshl.u32 q1, q11, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q0, q11, #10 + veor.u32 q4, q4, q1 + vst1.u32 {q12}, [r1]! + veor.u32 q1, q4, q0 + + vshr.u32 q4, q12, #17 + vshl.u32 q0, q12, #32-17 + vadd.u32 q13, q13, q1 + veor.u32 q4, q4, q0 + vshr.u32 q0, q12, #19 + vshl.u32 q1, q12, #32-19 + veor.u32 q4, q4, q0 + vst1.u32 {q13}, [r1]! + veor.u32 q4, q4, q1 + vshr.u32 q1, q12, #10 + vshl.u32 q0, q13, #32-17 + veor.u32 q1, q4, q1 + vshr.u32 q4, q13, #17 + vadd.u32 q14, q14, q1 + veor.u32 q4, q4, q0 + vshr.u32 q0, q13, #19 + vshl.u32 q1, q13, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q0, q13, #10 + veor.u32 q4, q4, q1 + vst1.u32 {q14}, [r1]! + veor.u32 q4, q4, q0 + vmov.u32 q6, #0x00000100 + vadd.u32 q15, q15, q4 + + vshr.u32 q4, q14, #17 + vshl.u32 q0, q14, #32-17 + vmov.u32 q7, #0x00400000 + vst1.u32 {q15}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q14, #19 + vshl.u32 q1, q14, #32-19 + veor.u32 q4, q4, q0 + vadd.u32 q9, q9, q7 + veor.u32 q4, q4, q1 + vshr.u32 q1, q14, #10 + vmov.u32 q2, #0x00000022 + veor.u32 q4, q4, q1 + vadd.u32 q9, q9, q2 + vld1.u32 {q5}, [r4]! + vadd.u32 q9, q9, q4 + vshr.u32 q4, q15, #17 + vshl.u32 q0, q15, #32-17 + vadd.u32 q6, q6, q10 + vst1.u32 {q9}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q15, #19 + vshl.u32 q1, q15, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q10, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q15, #10 + veor.u32 q10, q10, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q10, q10, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q10, q10, q1 + vadd.u32 q1, q6, q4 + veor.u32 q10, q10, q0 + vld1.u32 {q6}, [r4]! + vadd.u32 q10, q10, q1 + + b sha256d_ms_4way_extend_loop2 + + .align 4 +sha256d_ms_4way_4h: + .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 + .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 + .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372 + .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a + .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f + .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c + .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab + .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 + +sha256d_ms_4way_extend_coda2: + adr r4, sha256d_ms_4way_4h + mov r1, sp + vldmia r4, {q0-q7} + vmov.u32 q15, q7 + sub r3, r3, #64*16 + b sha256d_ms_4way_main_loop2 + +.macro sha256_4way_main_round_red i, rk, rw, rd, re, rf, rg, rh + vld1.u32 {q8}, [\rw]! 
+ vand.u32 q9, \rf, \re + vbic.u32 q10, \rg, \re + vshr.u32 q11, \re, #5 + vorr.u32 q10, q10, q9 + vshl.u32 q12, \re, #32-5 + vadd.u32 \rh, \rh, q10 + veor.u32 q10, \re, q11 + vshr.u32 q11, \re, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, \re, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 \rh, \rh, q8 + veor.u32 q10, q10, q12 + vld1.u32 {q9}, [\rk]! + vadd.u32 \rh, \rh, \rd + vshr.u32 q11, q10, #6 + vadd.u32 \rh, \rh, q9 + vshl.u32 q13, q10, #32-6 + vadd.u32 \rh, \rh, q11 + vadd.u32 \rh, \rh, q13 +.endm + +sha256d_ms_4way_finish: + sha256_4way_main_round_red 57, r3, r1, q2, q7, q4, q5, q6 + sha256_4way_main_round_red 58, r3, r1, q1, q6, q7, q4, q5 + sha256_4way_main_round_red 59, r3, r1, q0, q5, q6, q7, q4 + sha256_4way_main_round_red 60, r3, r1, q3, q4, q5, q6, q7 + + vadd.u32 q7, q7, q15 + add r0, r0, #7*16 + vst1.u32 {q7}, [r0] + + mov sp, r12 + vpop {q4-q7} + ldmfd sp!, {r4, pc} + + + .text + .code 32 + .align 2 + .globl sha256_use_4way + .globl _sha256_use_4way +#ifdef __ELF__ + .type sha256_use_4way, %function +#endif +sha256_use_4way: +_sha256_use_4way: + mov r0, #1 + bx lr + +#endif /* __ARM_NEON__ */ + +#endif diff --git a/sha2-ppc.S b/sha2-ppc.S index a0b60d2ac..b54bdf5da 100644 --- a/sha2-ppc.S +++ b/sha2-ppc.S @@ -138,77 +138,9 @@ T.sha256_k: #endif -.macro sha256_extend_doubleround i, rw, wo, ra, rb, ry, rz - lwz r14, \wo+(\i+1)*4(\rw) - rotrwi r12, \ry, 17 - rotrwi r13, \ry, 19 - add r11, r11, \ra - xor r12, r12, r13 - srwi r13, \ry, 10 - rotrwi \ra, r14, 7 - xor r12, r12, r13 - rotrwi r13, r14, 18 - add r12, r12, r11 - xor \ra, \ra, r13 - srwi r13, r14, 3 - lwz r11, \wo+(\i+2)*4(\rw) - xor \ra, \ra, r13 - rotrwi r13, \rz, 19 - add \ra, \ra, r12 - - rotrwi r12, \rz, 17 - add r14, r14, \rb - xor r12, r12, r13 - srwi r13, \rz, 10 - rotrwi \rb, r11, 7 - xor r12, r12, r13 - rotrwi r13, r11, 18 - stw \ra, \wo+(\i+16)*4(\rw) - xor \rb, \rb, r13 - srwi r13, r11, 3 - add r14, r14, r12 - xor \rb, \rb, r13 - add \rb, \rb, r14 - stw \rb, \wo+(\i+17)*4(\rw) -.endm - -.macro sha256_main_round i, rk, rw, wo, ra, rb, rc, rd, re, rf, rg, rh - lwz r12, \wo+(\i)*4(\rw) - and r13, \rf, \re - andc r14, \rg, \re - lwz r15, (\i)*4(\rk) - or r14, r14, r13 - rotrwi r13, \re, 5 - add \rh, \rh, r14 - xor r14, \re, r13 - rotrwi r13, \re, 19 - add \rh, \rh, r12 - xor r14, r14, r13 - add \rh, \rh, r15 - rotrwi r13, r14, 6 - xor r15, \ra, \rb - add \rh, \rh, r13 - rotrwi r13, \ra, 11 - and r15, r15, \rc - xor r12, \ra, r13 - rotrwi r13, \ra, 20 - and r14, \ra, \rb - xor r12, r12, r13 - xor r14, r14, r15 - rotrwi r13, r12, 2 - add r15, \rh, r14 - add \rh, \rh, \rd - add \rd, r15, r13 -.endm -.macro sha256_main_quadround i, rk, rw, wo - sha256_main_round \i+0, \rk, \rw, \wo, r4, r5, r6, r7, r8, r9, r10, r11 - sha256_main_round \i+1, \rk, \rw, \wo, r7, r4, r5, r6, r11, r8, r9, r10 - sha256_main_round \i+2, \rk, \rw, \wo, r6, r7, r4, r5, r10, r11, r8, r9 - sha256_main_round \i+3, \rk, \rw, \wo, r5, r6, r7, r4, r9, r10, r11, r8 -.endm #ifdef _AIX @@ -312,1672 +244,14905 @@ sha256_transform_swap: stw r10, 8*4+15*4(r1) sha256_transform_extend: - sha256_extend_doubleround 0, r1, 8*4, r4, r5, r9, r10 - sha256_extend_doubleround 2, r1, 8*4, r6, r7, r4, r5 - sha256_extend_doubleround 4, r1, 8*4, r8, r9, r6, r7 - sha256_extend_doubleround 6, r1, 8*4, r10, r4, r8, r9 - sha256_extend_doubleround 8, r1, 8*4, r5, r6, r10, r4 - sha256_extend_doubleround 10, r1, 8*4, r7, r8, r5, r6 - sha256_extend_doubleround 12, r1, 8*4, r9, r10, r7, r8 - sha256_extend_doubleround 14, r1, 8*4, r4, r5, r9, r10 - sha256_extend_doubleround 16, 
r1, 8*4, r6, r7, r4, r5 - sha256_extend_doubleround 18, r1, 8*4, r8, r9, r6, r7 - sha256_extend_doubleround 20, r1, 8*4, r10, r4, r8, r9 - sha256_extend_doubleround 22, r1, 8*4, r5, r6, r10, r4 - sha256_extend_doubleround 24, r1, 8*4, r7, r8, r5, r6 - sha256_extend_doubleround 26, r1, 8*4, r9, r10, r7, r8 - sha256_extend_doubleround 28, r1, 8*4, r4, r5, r9, r10 - sha256_extend_doubleround 30, r1, 8*4, r6, r7, r4, r5 - sha256_extend_doubleround 32, r1, 8*4, r8, r9, r6, r7 - sha256_extend_doubleround 34, r1, 8*4, r10, r4, r8, r9 - sha256_extend_doubleround 36, r1, 8*4, r5, r6, r10, r4 - sha256_extend_doubleround 38, r1, 8*4, r7, r8, r5, r6 - sha256_extend_doubleround 40, r1, 8*4, r9, r10, r7, r8 - sha256_extend_doubleround 42, r1, 8*4, r4, r5, r9, r10 - sha256_extend_doubleround 44, r1, 8*4, r6, r7, r4, r5 - sha256_extend_doubleround 46, r1, 8*4, r8, r9, r6, r7 - - lwz r4, 0*4(r3) - lwz r5, 1*4(r3) - lwz r6, 2*4(r3) - lwz r7, 3*4(r3) - lwz r8, 4*4(r3) - lwz r9, 5*4(r3) - lwz r10, 6*4(r3) - lwz r11, 7*4(r3) -#ifdef _AIX - ld r16, T.sha256_k(r2) -#else - lis r16, HI(sha256_k) - addi r16, r16, LO(sha256_k) -#endif - sha256_main_quadround 0, r16, r1, 8*4 - sha256_main_quadround 4, r16, r1, 8*4 - sha256_main_quadround 8, r16, r1, 8*4 - sha256_main_quadround 12, r16, r1, 8*4 - sha256_main_quadround 16, r16, r1, 8*4 - sha256_main_quadround 20, r16, r1, 8*4 - sha256_main_quadround 24, r16, r1, 8*4 - sha256_main_quadround 28, r16, r1, 8*4 - sha256_main_quadround 32, r16, r1, 8*4 - sha256_main_quadround 36, r16, r1, 8*4 - sha256_main_quadround 40, r16, r1, 8*4 - sha256_main_quadround 44, r16, r1, 8*4 - sha256_main_quadround 48, r16, r1, 8*4 - sha256_main_quadround 52, r16, r1, 8*4 - sha256_main_quadround 56, r16, r1, 8*4 - sha256_main_quadround 60, r16, r1, 8*4 - - lwz r12, 0*4(r3) - lwz r13, 1*4(r3) - lwz r14, 2*4(r3) - lwz r15, 3*4(r3) + lwz r14, 8*4+(0+1)*4(r1) + rotrwi r12, r9, 17 + rotrwi r13, r9, 19 + add r11, r11, r4 + xor r12, r12, r13 + srwi r13, r9, 10 + rotrwi r4, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r4, r4, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(0+2)*4(r1) + xor r4, r4, r13 + rotrwi r13, r10, 19 add r4, r4, r12 - add r5, r5, r13 - add r6, r6, r14 - add r7, r7, r15 - stw r4, 0*4(r3) - stw r5, 1*4(r3) - stw r6, 2*4(r3) - stw r7, 3*4(r3) - lwz r12, 4*4(r3) - lwz r13, 5*4(r3) - lwz r14, 6*4(r3) - lwz r15, 7*4(r3) - add r8, r8, r12 - add r9, r9, r13 - add r10, r10, r14 - add r11, r11, r15 - stw r8, 4*4(r3) - stw r9, 5*4(r3) - stw r10, 6*4(r3) - stw r11, 7*4(r3) - - ld r13, 2*4(r1) - ld r14, 4*4(r1) - ld r15, 6*4(r1) - ld r16, 72*4(r1) - addi r1, r1, 76*4 - blr - - .align 2 - .globl sha256d_ms - .globl _sha256d_ms - .globl .sha256d_ms -#ifdef __ELF__ - .type sha256d_ms, %function -#endif -sha256d_ms: -_sha256d_ms: -.sha256d_ms: - stdu r1, -80*4(r1) - std r13, 2*4(r1) - std r14, 4*4(r1) - std r15, 6*4(r1) - std r16, 72*4(r1) - std r17, 74*4(r1) - std r18, 76*4(r1) - - mr r17, r4 - mr r18, r5 - mr r16, r6 - - lwz r14, 3*4(r17) - lwz r6, 18*4(r17) - lwz r7, 19*4(r17) - - rotrwi r12, r14, 7 - rotrwi r13, r14, 18 - stw r6, 8*4+18*4(r1) + rotrwi r12, r10, 17 + add r14, r14, r5 xor r12, r12, r13 - srwi r13, r14, 3 - stw r7, 8*4+19*4(r1) + srwi r13, r10, 10 + rotrwi r5, r11, 7 xor r12, r12, r13 - lwz r8, 20*4(r17) + rotrwi r13, r11, 18 + stw r4, 8*4+(0+16)*4(r1) + xor r5, r5, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r5, r5, r13 + add r5, r5, r14 + stw r5, 8*4+(0+17)*4(r1) + lwz r14, 8*4+(2+1)*4(r1) + rotrwi r12, r4, 17 + rotrwi r13, r4, 19 + add r11, r11, r6 
+ xor r12, r12, r13 + srwi r13, r4, 10 + rotrwi r6, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r6, r6, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(2+2)*4(r1) + xor r6, r6, r13 + rotrwi r13, r5, 19 add r6, r6, r12 - lwz r10, 22*4(r17) + + rotrwi r12, r5, 17 + add r14, r14, r7 + xor r12, r12, r13 + srwi r13, r5, 10 + rotrwi r7, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r6, 8*4+(2+16)*4(r1) + xor r7, r7, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r7, r7, r13 add r7, r7, r14 - stw r6, 18*4(r17) - + stw r7, 8*4+(2+17)*4(r1) + lwz r14, 8*4+(4+1)*4(r1) rotrwi r12, r6, 17 rotrwi r13, r6, 19 - stw r7, 19*4(r17) + add r11, r11, r8 xor r12, r12, r13 srwi r13, r6, 10 - stw r8, 8*4+20*4(r1) + rotrwi r8, r14, 7 xor r12, r12, r13 - lwz r4, 23*4(r17) - add r8, r8, r12 - lwz r5, 24*4(r17) - - rotrwi r9, r7, 17 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r8, r8, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(4+2)*4(r1) + xor r8, r8, r13 rotrwi r13, r7, 19 - stw r8, 20*4(r17) - xor r9, r9, r13 + add r8, r8, r12 + + rotrwi r12, r7, 17 + add r14, r14, r9 + xor r12, r12, r13 srwi r13, r7, 10 - stw r10, 8*4+21*4(r1) + rotrwi r9, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r8, 8*4+(4+16)*4(r1) xor r9, r9, r13 - stw r4, 8*4+22*4(r1) - + srwi r13, r11, 3 + add r14, r14, r12 + xor r9, r9, r13 + add r9, r9, r14 + stw r9, 8*4+(4+17)*4(r1) + lwz r14, 8*4+(6+1)*4(r1) rotrwi r12, r8, 17 rotrwi r13, r8, 19 - stw r9, 21*4(r17) + add r11, r11, r10 xor r12, r12, r13 srwi r13, r8, 10 - stw r5, 8*4+23*4(r1) + rotrwi r10, r14, 7 xor r12, r12, r13 - rotrwi r14, r9, 17 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r10, r10, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(6+2)*4(r1) + xor r10, r10, r13 rotrwi r13, r9, 19 add r10, r10, r12 - lwz r11, 30*4(r17) - - xor r14, r14, r13 + + rotrwi r12, r9, 17 + add r14, r14, r4 + xor r12, r12, r13 srwi r13, r9, 10 - stw r10, 22*4(r17) - xor r14, r14, r13 - stw r11, 8*4+24*4(r1) + rotrwi r4, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r10, 8*4+(6+16)*4(r1) + xor r4, r4, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r4, r4, r13 add r4, r4, r14 - + stw r4, 8*4+(6+17)*4(r1) + lwz r14, 8*4+(8+1)*4(r1) rotrwi r12, r10, 17 rotrwi r13, r10, 19 - stw r4, 23*4(r17) + add r11, r11, r5 xor r12, r12, r13 srwi r13, r10, 10 - rotrwi r14, r4, 17 + rotrwi r5, r14, 7 xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r5, r5, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(8+2)*4(r1) + xor r5, r5, r13 rotrwi r13, r4, 19 - xor r14, r14, r13 - srwi r13, r4, 10 add r5, r5, r12 - xor r14, r14, r13 - stw r5, 24*4(r17) + + rotrwi r12, r4, 17 + add r14, r14, r6 + xor r12, r12, r13 + srwi r13, r4, 10 + rotrwi r6, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r5, 8*4+(8+16)*4(r1) + xor r6, r6, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r6, r6, r13 add r6, r6, r14 - + stw r6, 8*4+(8+17)*4(r1) + lwz r14, 8*4+(10+1)*4(r1) rotrwi r12, r5, 17 rotrwi r13, r5, 19 - stw r6, 25*4(r17) + add r11, r11, r7 xor r12, r12, r13 srwi r13, r5, 10 - rotrwi r14, r6, 17 + rotrwi r7, r14, 7 xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r7, r7, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(10+2)*4(r1) + xor r7, r7, r13 rotrwi r13, r6, 19 - xor r14, r14, r13 - srwi r13, r6, 10 add r7, r7, r12 - xor r14, r14, r13 - stw r7, 26*4(r17) + + rotrwi r12, r6, 17 + add r14, r14, r8 + xor r12, r12, r13 + srwi r13, r6, 10 + rotrwi r8, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r7, 8*4+(10+16)*4(r1) + xor r8, r8, r13 + srwi r13, r11, 3 + add 
r14, r14, r12 + xor r8, r8, r13 add r8, r8, r14 - + stw r8, 8*4+(10+17)*4(r1) + lwz r14, 8*4+(12+1)*4(r1) rotrwi r12, r7, 17 rotrwi r13, r7, 19 - stw r8, 27*4(r17) + add r11, r11, r9 xor r12, r12, r13 srwi r13, r7, 10 - rotrwi r14, r8, 17 + rotrwi r9, r14, 7 xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r9, r9, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(12+2)*4(r1) + xor r9, r9, r13 rotrwi r13, r8, 19 - xor r14, r14, r13 - srwi r13, r8, 10 add r9, r9, r12 - xor r14, r14, r13 - stw r9, 28*4(r17) - add r10, r10, r14 - - lwz r14, 31*4(r17) - rotrwi r12, r9, 17 - rotrwi r13, r9, 19 - stw r10, 29*4(r17) + + rotrwi r12, r8, 17 + add r14, r14, r10 xor r12, r12, r13 - srwi r13, r9, 10 - stw r14, 8*4+25*4(r1) + srwi r13, r8, 10 + rotrwi r10, r11, 7 xor r12, r12, r13 - add r11, r11, r12 - add r5, r5, r14 - rotrwi r12, r10, 17 - rotrwi r13, r10, 19 - add r4, r4, r11 - - lwz r11, 16*4(r17) + rotrwi r13, r11, 18 + stw r9, 8*4+(12+16)*4(r1) + xor r10, r10, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r10, r10, r13 + add r10, r10, r14 + stw r10, 8*4+(12+17)*4(r1) + lwz r14, 8*4+(14+1)*4(r1) + rotrwi r12, r9, 17 + rotrwi r13, r9, 19 + add r11, r11, r4 xor r12, r12, r13 - srwi r13, r10, 10 - stw r4, 30*4(r17) + srwi r13, r9, 10 + rotrwi r4, r14, 7 xor r12, r12, r13 - add r5, r5, r12 - stw r5, 31*4(r17) - - sha256_extend_doubleround 16, r17, 0, r6, r7, r4, r5 - sha256_extend_doubleround 18, r17, 0, r8, r9, r6, r7 - sha256_extend_doubleround 20, r17, 0, r10, r4, r8, r9 - sha256_extend_doubleround 22, r17, 0, r5, r6, r10, r4 - sha256_extend_doubleround 24, r17, 0, r7, r8, r5, r6 - sha256_extend_doubleround 26, r17, 0, r9, r10, r7, r8 - sha256_extend_doubleround 28, r17, 0, r4, r5, r9, r10 - sha256_extend_doubleround 30, r17, 0, r6, r7, r4, r5 - sha256_extend_doubleround 32, r17, 0, r8, r9, r6, r7 - sha256_extend_doubleround 34, r17, 0, r10, r4, r8, r9 - sha256_extend_doubleround 36, r17, 0, r5, r6, r10, r4 - sha256_extend_doubleround 38, r17, 0, r7, r8, r5, r6 - sha256_extend_doubleround 40, r17, 0, r9, r10, r7, r8 - sha256_extend_doubleround 42, r17, 0, r4, r5, r9, r10 - sha256_extend_doubleround 44, r17, 0, r6, r7, r4, r5 - sha256_extend_doubleround 46, r17, 0, r8, r9, r6, r7 - - lwz r4, 0*4(r16) - lwz r9, 1*4(r16) - lwz r10, 2*4(r16) - lwz r11, 3*4(r16) - lwz r8, 4*4(r16) - lwz r5, 5*4(r16) - lwz r6, 6*4(r16) - lwz r7, 7*4(r16) -#ifdef _AIX - ld r16, T.sha256_k(r2) -#else - lis r16, HI(sha256_k) - addi r16, r16, LO(sha256_k) -#endif - - sha256_main_round 3, r16, r17, 0, r5, r6, r7, r4, r9, r10, r11, r8 - sha256_main_quadround 4, r16, r17, 0 - sha256_main_quadround 8, r16, r17, 0 - sha256_main_quadround 12, r16, r17, 0 - sha256_main_quadround 16, r16, r17, 0 - sha256_main_quadround 20, r16, r17, 0 - sha256_main_quadround 24, r16, r17, 0 - sha256_main_quadround 28, r16, r17, 0 - sha256_main_quadround 32, r16, r17, 0 - sha256_main_quadround 36, r16, r17, 0 - sha256_main_quadround 40, r16, r17, 0 - sha256_main_quadround 44, r16, r17, 0 - sha256_main_quadround 48, r16, r17, 0 - sha256_main_quadround 52, r16, r17, 0 - sha256_main_quadround 56, r16, r17, 0 - sha256_main_quadround 60, r16, r17, 0 - - lwz r12, 0*4(r18) - lwz r13, 1*4(r18) - lwz r14, 2*4(r18) - lwz r15, 3*4(r18) + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r4, r4, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(14+2)*4(r1) + xor r4, r4, r13 + rotrwi r13, r10, 19 add r4, r4, r12 - add r5, r5, r13 - add r6, r6, r14 - add r7, r7, r15 - stw r4, 8*4+0*4(r1) - stw r5, 8*4+1*4(r1) - stw r6, 8*4+2*4(r1) - stw r7, 8*4+3*4(r1) - lwz r12, 
4*4(r18) - lwz r13, 5*4(r18) - lwz r14, 6*4(r18) - lwz r15, 7*4(r18) - add r8, r8, r12 - add r9, r9, r13 - add r10, r10, r14 - add r11, r11, r15 - stw r8, 8*4+4*4(r1) - stw r9, 8*4+5*4(r1) - stw r10, 8*4+6*4(r1) - stw r11, 8*4+7*4(r1) - lwz r4, 8*4+18*4(r1) - lwz r5, 8*4+19*4(r1) - lwz r6, 8*4+20*4(r1) - lwz r7, 8*4+21*4(r1) - lwz r8, 8*4+22*4(r1) - lwz r9, 8*4+23*4(r1) - lwz r10, 8*4+24*4(r1) - lwz r11, 8*4+25*4(r1) - stw r4, 18*4(r17) - stw r5, 19*4(r17) - stw r6, 20*4(r17) - stw r7, 22*4(r17) - stw r8, 23*4(r17) - stw r9, 24*4(r17) - stw r10, 30*4(r17) - stw r11, 31*4(r17) - - lis r8, 0x8000 - li r9, 0 - li r10, 0x0100 - - lwz r14, 8*4+1*4(r1) - lwz r4, 8*4+0*4(r1) - - lwz r11, 8*4+2*4(r1) - rotrwi r12, r14, 7 - rotrwi r13, r14, 18 - - stw r8, 8*4+8*4(r1) - stw r9, 8*4+9*4(r1) - stw r9, 8*4+10*4(r1) - stw r9, 8*4+11*4(r1) - stw r9, 8*4+12*4(r1) - stw r9, 8*4+13*4(r1) - stw r9, 8*4+14*4(r1) - stw r10, 8*4+15*4(r1) - + rotrwi r12, r10, 17 + add r14, r14, r5 xor r12, r12, r13 - srwi r13, r14, 3 - addis r5, r14, 0x00a0 + srwi r13, r10, 10 + rotrwi r5, r11, 7 xor r12, r12, r13 - rotrwi r14, r11, 7 rotrwi r13, r11, 18 - add r4, r4, r12 - xor r14, r14, r13 + stw r4, 8*4+(14+16)*4(r1) + xor r5, r5, r13 srwi r13, r11, 3 - stw r4, 8*4+16*4(r1) - xor r14, r14, r13 + add r14, r14, r12 + xor r5, r5, r13 + add r5, r5, r14 + stw r5, 8*4+(14+17)*4(r1) + lwz r14, 8*4+(16+1)*4(r1) rotrwi r12, r4, 17 rotrwi r13, r4, 19 - add r5, r5, r14 - lwz r14, 8*4+3*4(r1) - - stw r5, 8*4+17*4(r1) + add r11, r11, r6 xor r12, r12, r13 srwi r13, r4, 10 rotrwi r6, r14, 7 xor r12, r12, r13 rotrwi r13, r14, 18 + add r12, r12, r11 xor r6, r6, r13 srwi r13, r14, 3 - add r11, r11, r12 + lwz r11, 8*4+(16+2)*4(r1) xor r6, r6, r13 - rotrwi r12, r5, 17 rotrwi r13, r5, 19 - add r6, r6, r11 - lwz r11, 8*4+4*4(r1) - - stw r6, 8*4+18*4(r1) + add r6, r6, r12 + + rotrwi r12, r5, 17 + add r14, r14, r7 xor r12, r12, r13 srwi r13, r5, 10 rotrwi r7, r11, 7 xor r12, r12, r13 rotrwi r13, r11, 18 + stw r6, 8*4+(16+16)*4(r1) xor r7, r7, r13 srwi r13, r11, 3 add r14, r14, r12 xor r7, r7, r13 + add r7, r7, r14 + stw r7, 8*4+(16+17)*4(r1) + lwz r14, 8*4+(18+1)*4(r1) rotrwi r12, r6, 17 rotrwi r13, r6, 19 - add r7, r7, r14 - lwz r14, 8*4+5*4(r1) - - stw r7, 8*4+19*4(r1) + add r11, r11, r8 xor r12, r12, r13 srwi r13, r6, 10 rotrwi r8, r14, 7 xor r12, r12, r13 rotrwi r13, r14, 18 + add r12, r12, r11 xor r8, r8, r13 srwi r13, r14, 3 - add r11, r11, r12 + lwz r11, 8*4+(18+2)*4(r1) xor r8, r8, r13 - rotrwi r12, r7, 17 rotrwi r13, r7, 19 - add r8, r8, r11 - lwz r11, 8*4+6*4(r1) - - stw r8, 8*4+20*4(r1) + add r8, r8, r12 + + rotrwi r12, r7, 17 + add r14, r14, r9 xor r12, r12, r13 srwi r13, r7, 10 rotrwi r9, r11, 7 xor r12, r12, r13 rotrwi r13, r11, 18 + stw r8, 8*4+(18+16)*4(r1) xor r9, r9, r13 srwi r13, r11, 3 add r14, r14, r12 xor r9, r9, r13 + add r9, r9, r14 + stw r9, 8*4+(18+17)*4(r1) + lwz r14, 8*4+(20+1)*4(r1) rotrwi r12, r8, 17 rotrwi r13, r8, 19 - add r9, r9, r14 - lwz r14, 8*4+7*4(r1) - - stw r9, 8*4+21*4(r1) + add r11, r11, r10 xor r12, r12, r13 srwi r13, r8, 10 rotrwi r10, r14, 7 xor r12, r12, r13 rotrwi r13, r14, 18 + add r12, r12, r11 xor r10, r10, r13 srwi r13, r14, 3 - add r11, r11, r12 + lwz r11, 8*4+(20+2)*4(r1) xor r10, r10, r13 - rotrwi r12, r9, 17 rotrwi r13, r9, 19 - addi r11, r11, 0x0100 + add r10, r10, r12 + + rotrwi r12, r9, 17 add r14, r14, r4 - add r10, r10, r11 - xor r12, r12, r13 srwi r13, r9, 10 - stw r10, 8*4+22*4(r1) - addis r14, r14, 0x1100 + rotrwi r4, r11, 7 xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r10, 
8*4+(20+16)*4(r1) + xor r4, r4, r13 + srwi r13, r11, 3 add r14, r14, r12 + xor r4, r4, r13 + add r4, r4, r14 + stw r4, 8*4+(20+17)*4(r1) + lwz r14, 8*4+(22+1)*4(r1) rotrwi r12, r10, 17 rotrwi r13, r10, 19 - addi r4, r14, 0x2000 + add r11, r11, r5 xor r12, r12, r13 srwi r13, r10, 10 - stw r4, 8*4+23*4(r1) - addis r5, r5, 0x8000 + rotrwi r5, r14, 7 xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r5, r5, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(22+2)*4(r1) + xor r5, r5, r13 + rotrwi r13, r4, 19 add r5, r5, r12 rotrwi r12, r4, 17 - rotrwi r13, r4, 19 - stw r5, 8*4+24*4(r1) + add r14, r14, r6 xor r12, r12, r13 srwi r13, r4, 10 - rotrwi r11, r5, 17 + rotrwi r6, r11, 7 xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r5, 8*4+(22+16)*4(r1) + xor r6, r6, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r6, r6, r13 + add r6, r6, r14 + stw r6, 8*4+(22+17)*4(r1) + lwz r14, 8*4+(24+1)*4(r1) + rotrwi r12, r5, 17 rotrwi r13, r5, 19 - xor r11, r11, r13 + add r11, r11, r7 + xor r12, r12, r13 srwi r13, r5, 10 - add r6, r6, r12 - xor r11, r11, r13 - stw r6, 8*4+25*4(r1) - add r7, r7, r11 - - rotrwi r12, r6, 17 + rotrwi r7, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r7, r7, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(24+2)*4(r1) + xor r7, r7, r13 rotrwi r13, r6, 19 - stw r7, 8*4+26*4(r1) + add r7, r7, r12 + + rotrwi r12, r6, 17 + add r14, r14, r8 xor r12, r12, r13 srwi r13, r6, 10 - rotrwi r11, r7, 17 + rotrwi r8, r11, 7 xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r7, 8*4+(24+16)*4(r1) + xor r8, r8, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r8, r8, r13 + add r8, r8, r14 + stw r8, 8*4+(24+17)*4(r1) + lwz r14, 8*4+(26+1)*4(r1) + rotrwi r12, r7, 17 rotrwi r13, r7, 19 - xor r11, r11, r13 + add r11, r11, r9 + xor r12, r12, r13 srwi r13, r7, 10 - add r8, r8, r12 - xor r11, r11, r13 - stw r8, 8*4+27*4(r1) - add r9, r9, r11 - - rotrwi r14, r8, 17 + rotrwi r9, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r9, r9, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(26+2)*4(r1) + xor r9, r9, r13 rotrwi r13, r8, 19 + add r9, r9, r12 + + rotrwi r12, r8, 17 + add r14, r14, r10 + xor r12, r12, r13 + srwi r13, r8, 10 + rotrwi r10, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r9, 8*4+(26+16)*4(r1) + xor r10, r10, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r10, r10, r13 + add r10, r10, r14 + stw r10, 8*4+(26+17)*4(r1) + lwz r14, 8*4+(28+1)*4(r1) rotrwi r12, r9, 17 - stw r9, 8*4+28*4(r1) - addis r4, r4, 0x0040 - xor r14, r14, r13 rotrwi r13, r9, 19 + add r11, r11, r4 xor r12, r12, r13 - srwi r13, r8, 10 - xor r14, r14, r13 srwi r13, r9, 10 + rotrwi r4, r14, 7 xor r12, r12, r13 - addi r4, r4, 0x0022 - add r10, r10, r14 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r4, r4, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(28+2)*4(r1) + xor r4, r4, r13 + rotrwi r13, r10, 19 add r4, r4, r12 - lwz r11, 8*4+16*4(r1) - - addi r5, r5, 0x0100 - stw r4, 8*4+30*4(r1) - rotrwi r14, r11, 7 - stw r10, 8*4+29*4(r1) - rotrwi r13, r11, 18 + rotrwi r12, r10, 17 - xor r14, r14, r13 - rotrwi r13, r10, 19 + add r14, r14, r5 xor r12, r12, r13 - srwi r13, r11, 3 - xor r14, r14, r13 srwi r13, r10, 10 + rotrwi r5, r11, 7 xor r12, r12, r13 - add r5, r5, r14 - add r5, r5, r12 - stw r5, 8*4+31*4(r1) - - sha256_extend_doubleround 16, r1, 8*4, r6, r7, r4, r5 - sha256_extend_doubleround 18, r1, 8*4, r8, r9, r6, r7 - sha256_extend_doubleround 20, r1, 8*4, r10, r4, r8, r9 - sha256_extend_doubleround 22, r1, 8*4, r5, r6, r10, r4 - sha256_extend_doubleround 24, r1, 8*4, r7, r8, r5, r6 - 
sha256_extend_doubleround 26, r1, 8*4, r9, r10, r7, r8 - sha256_extend_doubleround 28, r1, 8*4, r4, r5, r9, r10 - sha256_extend_doubleround 30, r1, 8*4, r6, r7, r4, r5 - sha256_extend_doubleround 32, r1, 8*4, r8, r9, r6, r7 - sha256_extend_doubleround 34, r1, 8*4, r10, r4, r8, r9 - sha256_extend_doubleround 36, r1, 8*4, r5, r6, r10, r4 - sha256_extend_doubleround 38, r1, 8*4, r7, r8, r5, r6 - sha256_extend_doubleround 40, r1, 8*4, r9, r10, r7, r8 - sha256_extend_doubleround 42, r1, 8*4, r4, r5, r9, r10 - -#ifdef _AIX - ld r18, T.sha256_h(r2) -#else - lis r18, HI(sha256_h) - addi r18, r18, LO(sha256_h) -#endif - - lwz r14, 8*4+(44+1)*4(r1) + rotrwi r13, r11, 18 + stw r4, 8*4+(28+16)*4(r1) + xor r5, r5, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r5, r5, r13 + add r5, r5, r14 + stw r5, 8*4+(28+17)*4(r1) + lwz r14, 8*4+(30+1)*4(r1) rotrwi r12, r4, 17 rotrwi r13, r4, 19 - add r15, r11, r6 - rotrwi r6, r14, 7 - rotrwi r11, r14, 18 + add r11, r11, r6 xor r12, r12, r13 - xor r6, r6, r11 - - lwz r8, 4*4(r18) - lwz r9, 5*4(r18) - lwz r10, 6*4(r18) - lwz r11, 7*4(r18) - srwi r13, r4, 10 - srwi r14, r14, 3 + rotrwi r6, r14, 7 xor r12, r12, r13 - xor r6, r6, r14 - add r12, r12, r15 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r6, r6, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(30+2)*4(r1) + xor r6, r6, r13 + rotrwi r13, r5, 19 add r6, r6, r12 - stw r6, 8*4+(44+16)*4(r1) - - lwz r4, 0*4(r18) - lwz r5, 1*4(r18) - lwz r6, 2*4(r18) - lwz r7, 3*4(r18) - - sha256_main_quadround 0, r16, r1, 8*4 - sha256_main_quadround 4, r16, r1, 8*4 - sha256_main_quadround 8, r16, r1, 8*4 - sha256_main_quadround 12, r16, r1, 8*4 - sha256_main_quadround 16, r16, r1, 8*4 - sha256_main_quadround 20, r16, r1, 8*4 - sha256_main_quadround 24, r16, r1, 8*4 - sha256_main_quadround 28, r16, r1, 8*4 - sha256_main_quadround 32, r16, r1, 8*4 - sha256_main_quadround 36, r16, r1, 8*4 - sha256_main_quadround 40, r16, r1, 8*4 - sha256_main_quadround 44, r16, r1, 8*4 - sha256_main_quadround 48, r16, r1, 8*4 - sha256_main_quadround 52, r16, r1, 8*4 - sha256_main_round 56, r16, r1, 8*4, r4, r5, r6, r7, r8, r9, r10, r11 - -.macro sha256_main_round_red i, rk, rw, wo, rd, re, rf, rg, rh - lwz r12, \wo+(\i)*4(\rw) - and r15, \rf, \re - andc r14, \rg, \re - add \rh, \rh, \rd - or r14, r14, r15 - lwz r15, (\i)*4(\rk) - rotrwi r13, \re, 5 - add \rh, \rh, r14 - xor r14, \re, r13 - rotrwi r13, \re, 19 - add \rh, \rh, r12 - xor r14, r14, r13 - add \rh, \rh, r15 - rotrwi r13, r14, 6 - add \rh, \rh, r13 -.endm - - sha256_main_round_red 57, r16, r1, 8*4, r6, r11, r8, r9, r10 - sha256_main_round_red 58, r16, r1, 8*4, r5, r10, r11, r8, r9 - sha256_main_round_red 59, r16, r1, 8*4, r4, r9, r10, r11, r8 - lwz r5, 7*4(r18) - sha256_main_round_red 60, r16, r1, 8*4, r7, r8, r9, r10, r11 - - add r11, r11, r5 - stw r11, 7*4(r3) - - ld r13, 2*4(r1) - ld r14, 4*4(r1) - ld r15, 6*4(r1) - ld r16, 72*4(r1) - ld r17, 74*4(r1) - ld r18, 76*4(r1) - addi r1, r1, 80*4 - blr + rotrwi r12, r5, 17 + add r14, r14, r7 + xor r12, r12, r13 + srwi r13, r5, 10 + rotrwi r7, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r6, 8*4+(30+16)*4(r1) + xor r7, r7, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r7, r7, r13 + add r7, r7, r14 + stw r7, 8*4+(30+17)*4(r1) + lwz r14, 8*4+(32+1)*4(r1) + rotrwi r12, r6, 17 + rotrwi r13, r6, 19 + add r11, r11, r8 + xor r12, r12, r13 + srwi r13, r6, 10 + rotrwi r8, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r8, r8, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(32+2)*4(r1) + xor r8, r8, r13 + rotrwi r13, r7, 19 + add 
r8, r8, r12 -#ifdef __ALTIVEC__ + rotrwi r12, r7, 17 + add r14, r14, r9 + xor r12, r12, r13 + srwi r13, r7, 10 + rotrwi r9, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r8, 8*4+(32+16)*4(r1) + xor r9, r9, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r9, r9, r13 + add r9, r9, r14 + stw r9, 8*4+(32+17)*4(r1) + lwz r14, 8*4+(34+1)*4(r1) + rotrwi r12, r8, 17 + rotrwi r13, r8, 19 + add r11, r11, r10 + xor r12, r12, r13 + srwi r13, r8, 10 + rotrwi r10, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r10, r10, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(34+2)*4(r1) + xor r10, r10, r13 + rotrwi r13, r9, 19 + add r10, r10, r12 -#ifdef __APPLE__ - .machine ppc7400 -#endif + rotrwi r12, r9, 17 + add r14, r14, r4 + xor r12, r12, r13 + srwi r13, r9, 10 + rotrwi r4, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r10, 8*4+(34+16)*4(r1) + xor r4, r4, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r4, r4, r13 + add r4, r4, r14 + stw r4, 8*4+(34+17)*4(r1) + lwz r14, 8*4+(36+1)*4(r1) + rotrwi r12, r10, 17 + rotrwi r13, r10, 19 + add r11, r11, r5 + xor r12, r12, r13 + srwi r13, r10, 10 + rotrwi r5, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r5, r5, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(36+2)*4(r1) + xor r5, r5, r13 + rotrwi r13, r4, 19 + add r5, r5, r12 -#ifdef _AIX - .csect .text[RO] -#else - .data -#endif - .align 4 -sha256_4h: - .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 - .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 - .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372 - .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a - .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f - .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c - .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab - .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 + rotrwi r12, r4, 17 + add r14, r14, r6 + xor r12, r12, r13 + srwi r13, r4, 10 + rotrwi r6, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r5, 8*4+(36+16)*4(r1) + xor r6, r6, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r6, r6, r13 + add r6, r6, r14 + stw r6, 8*4+(36+17)*4(r1) + lwz r14, 8*4+(38+1)*4(r1) + rotrwi r12, r5, 17 + rotrwi r13, r5, 19 + add r11, r11, r7 + xor r12, r12, r13 + srwi r13, r5, 10 + rotrwi r7, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r7, r7, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(38+2)*4(r1) + xor r7, r7, r13 + rotrwi r13, r6, 19 + add r7, r7, r12 - .align 4 -sha256_4k: - .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98 - .long 0x71374491, 0x71374491, 0x71374491, 0x71374491 - .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf - .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5 - .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b - .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1 - .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4 - .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5 - .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98 - .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01 - .long 0x243185be, 0x243185be, 0x243185be, 0x243185be - .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3 - .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74 - .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe - .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7 - .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174 - .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1 - .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786 - .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6 - .long 
0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc - .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f - .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa - .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc - .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da - .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152 - .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d - .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8 - .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7 - .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3 - .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147 - .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351 - .long 0x14292967, 0x14292967, 0x14292967, 0x14292967 - .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85 - .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138 - .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc - .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13 - .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354 - .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb - .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e - .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85 - .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1 - .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b - .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70 - .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3 - .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819 - .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624 - .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585 - .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070 - .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116 - .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08 - .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c - .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5 - .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3 - .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a - .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f - .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3 - .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee - .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f - .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814 - .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208 - .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa - .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb - .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7 - .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2 + rotrwi r12, r6, 17 + add r14, r14, r8 + xor r12, r12, r13 + srwi r13, r6, 10 + rotrwi r8, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r7, 8*4+(38+16)*4(r1) + xor r8, r8, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r8, r8, r13 + add r8, r8, r14 + stw r8, 8*4+(38+17)*4(r1) + lwz r14, 8*4+(40+1)*4(r1) + rotrwi r12, r7, 17 + rotrwi r13, r7, 19 + add r11, r11, r9 + xor r12, r12, r13 + srwi r13, r7, 10 + rotrwi r9, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r9, r9, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(40+2)*4(r1) + xor r9, r9, r13 + rotrwi r13, r8, 19 + add r9, r9, r12 - .align 4 -sha256d_4preext2: - .long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000 - .long 0x11002000, 0x11002000, 0x11002000, 0x11002000 - .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 - .long 0x00400022, 0x00400022, 0x00400022, 0x00400022 + rotrwi r12, r8, 17 + add r14, r14, r10 + xor r12, r12, r13 + srwi r13, r8, 10 + rotrwi r10, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r9, 8*4+(40+16)*4(r1) + xor r10, r10, r13 + srwi r13, r11, 3 + add 
r14, r14, r12 + xor r10, r10, r13 + add r10, r10, r14 + stw r10, 8*4+(40+17)*4(r1) + lwz r14, 8*4+(42+1)*4(r1) + rotrwi r12, r9, 17 + rotrwi r13, r9, 19 + add r11, r11, r4 + xor r12, r12, r13 + srwi r13, r9, 10 + rotrwi r4, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r4, r4, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(42+2)*4(r1) + xor r4, r4, r13 + rotrwi r13, r10, 19 + add r4, r4, r12 - .align 4 -br_perm: - .long 0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c + rotrwi r12, r10, 17 + add r14, r14, r5 + xor r12, r12, r13 + srwi r13, r10, 10 + rotrwi r5, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r4, 8*4+(42+16)*4(r1) + xor r5, r5, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r5, r5, r13 + add r5, r5, r14 + stw r5, 8*4+(42+17)*4(r1) + lwz r14, 8*4+(44+1)*4(r1) + rotrwi r12, r4, 17 + rotrwi r13, r4, 19 + add r11, r11, r6 + xor r12, r12, r13 + srwi r13, r4, 10 + rotrwi r6, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r6, r6, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(44+2)*4(r1) + xor r6, r6, r13 + rotrwi r13, r5, 19 + add r6, r6, r12 + + rotrwi r12, r5, 17 + add r14, r14, r7 + xor r12, r12, r13 + srwi r13, r5, 10 + rotrwi r7, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r6, 8*4+(44+16)*4(r1) + xor r7, r7, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r7, r7, r13 + add r7, r7, r14 + stw r7, 8*4+(44+17)*4(r1) + lwz r14, 8*4+(46+1)*4(r1) + rotrwi r12, r6, 17 + rotrwi r13, r6, 19 + add r11, r11, r8 + xor r12, r12, r13 + srwi r13, r6, 10 + rotrwi r8, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r8, r8, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(46+2)*4(r1) + xor r8, r8, r13 + rotrwi r13, r7, 19 + add r8, r8, r12 + rotrwi r12, r7, 17 + add r14, r14, r9 + xor r12, r12, r13 + srwi r13, r7, 10 + rotrwi r9, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r8, 8*4+(46+16)*4(r1) + xor r9, r9, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r9, r9, r13 + add r9, r9, r14 + stw r9, 8*4+(46+17)*4(r1) + + lwz r4, 0*4(r3) + lwz r5, 1*4(r3) + lwz r6, 2*4(r3) + lwz r7, 3*4(r3) + lwz r8, 4*4(r3) + lwz r9, 5*4(r3) + lwz r10, 6*4(r3) + lwz r11, 7*4(r3) #ifdef _AIX - .toc -T.sha256_4h: - .tc sha256_4h[TC], sha256_4h -T.sha256_4k: - .tc sha256_4k[TC], sha256_4k -T.sha256d_4preext2: - .tc sha256d_4preext2[TC], sha256d_4preext2 -T.br_perm: - .tc br_perm[TC], br_perm + ld r16, T.sha256_k(r2) +#else + lis r16, HI(sha256_k) + addi r16, r16, LO(sha256_k) #endif + lwz r12, 8*4+(0+0)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (0+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 8*4+(0+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (0+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 -.macro sha256_4way_extend_setup - vspltisw v0, 10 - vspltisw v1, -7 - vspltisw v16, 3 - vspltisw v17, 15 - vspltisw v18, 14 - vspltisw v19, 13 -.endm + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + 
and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(0+2)*4(r1) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (0+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 -.macro sha256_4way_extend_doubleround i, rw, va, vb, vy, vz - lvx v14, \rw, r7 - vrlw v12, \vy, v17 - vrlw v13, \vy, v19 - vadduwm v11, v11, \va - vxor v12, v12, v13 - vsrw v13, \vy, v0 - vrlw \va, v14, v1 - vxor v12, v12, v13 - vrlw v13, v14, v18 - vadduwm v12, v12, v11 - vxor \va, \va, v13 - vsrw v13, v14, v16 - lvx v11, \rw, r8 - vxor \va, \va, v13 - vrlw v13, \vz, v19 - vadduwm \va, \va, v12 + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(0+3)*4(r1) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (0+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 - vrlw v12, \vz, v17 - vadduwm v14, v14, \vb - vxor v12, v12, v13 - vsrw v13, \vz, v0 - vrlw \vb, v11, v1 - vxor v12, v12, v13 - vrlw v13, v11, v18 - stvx \va, \rw, r10 - vxor \vb, \vb, v13 - vsrw v13, v11, v16 - vadduwm v14, v14, v12 - vxor \vb, \vb, v13 - vadduwm \vb, \vb, v14 - stvx \vb, \rw, r11 - addi \rw, \rw, 2*16 -.endm + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 8*4+(4+0)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (4+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 8*4+(4+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (4+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 -.macro sha256_4way_main_setup - vspltisw v2, 12 - vspltisw v3, -5 - vspltisw v16, -6 - vspltisw v17, -11 - vspltisw v18, -2 -.endm + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(4+2)*4(r1) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (4+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 -.macro sha256_4way_main_round i, rk, rw, va, vb, vc, vd, ve, vf, vg, vh - li r6, (\i)*16 - lvx v12, \rw, r6 - vand v13, \vf, \ve - vandc v14, \vg, \ve - lvx v15, \rk, 
r6 - vor v14, v14, v13 - vrlw v13, \ve, v3 - vadduwm \vh, \vh, v14 - vxor v14, \ve, v13 - vrlw v13, \ve, v19 - vadduwm \vh, \vh, v12 - vxor v14, v14, v13 - vadduwm \vh, \vh, v15 - vrlw v13, v14, v16 - vxor v15, \va, \vb - vadduwm \vh, \vh, v13 + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(4+3)*4(r1) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (4+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 - vrlw v13, \va, v17 - vand v15, v15, \vc - vxor v12, \va, v13 - vrlw v13, \va, v2 - vand v14, \va, \vb - vxor v12, v12, v13 - vxor v14, v14, v15 - vrlw v13, v12, v18 - vadduwm v15, \vh, v14 - vadduwm \vh, \vh, \vd - vadduwm \vd, v15, v13 -.endm + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 8*4+(8+0)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (8+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 -.macro sha256_4way_main_quadround i, rk, rw - sha256_4way_main_round \i+0, \rk, \rw, v4, v5, v6, v7, v8, v9, v10, v11 - sha256_4way_main_round \i+1, \rk, \rw, v7, v4, v5, v6, v11, v8, v9, v10 - sha256_4way_main_round \i+2, \rk, \rw, v6, v7, v4, v5, v10, v11, v8, v9 - sha256_4way_main_round \i+3, \rk, \rw, v5, v6, v7, v4, v9, v10, v11, v8 -.endm + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 8*4+(8+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (8+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(8+2)*4(r1) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (8+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 -#ifdef _AIX - .csect .text[PR] -#else - .text -#endif - .align 2 - .globl sha256_init_4way - .globl _sha256_init_4way - .globl .sha256_init_4way -#ifdef __ELF__ - .type sha256_init_4way, %function -#endif -sha256_init_4way: -_sha256_init_4way: -.sha256_init_4way: - mfspr r0, 256 - oris r12, r0, 0xff00 - mtspr 256, r12 - -#ifdef _AIX - ld r4, T.sha256_4h(r2) -#else - lis r4, HI(sha256_4h) + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(8+3)*4(r1) 
+ and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (8+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 8*4+(12+0)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (12+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 8*4+(12+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (12+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(12+2)*4(r1) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (12+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(12+3)*4(r1) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (12+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 8*4+(16+0)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (16+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 8*4+(16+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (16+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + 
xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(16+2)*4(r1) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (16+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(16+3)*4(r1) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (16+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 8*4+(20+0)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (20+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 8*4+(20+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (20+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(20+2)*4(r1) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (20+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(20+3)*4(r1) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (20+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 8*4+(24+0)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (24+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + 
rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 8*4+(24+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (24+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(24+2)*4(r1) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (24+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(24+3)*4(r1) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (24+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 8*4+(28+0)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (28+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 8*4+(28+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (28+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(28+2)*4(r1) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (28+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(28+3)*4(r1) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (28+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 
19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 8*4+(32+0)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (32+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 8*4+(32+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (32+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(32+2)*4(r1) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (32+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(32+3)*4(r1) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (32+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 8*4+(36+0)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (36+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 8*4+(36+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (36+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(36+2)*4(r1) + and r13, r11, r10 + andc r14, 
r8, r10 + lwz r15, (36+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(36+3)*4(r1) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (36+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 8*4+(40+0)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (40+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 8*4+(40+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (40+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(40+2)*4(r1) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (40+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(40+3)*4(r1) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (40+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 8*4+(44+0)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (44+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + 
add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 8*4+(44+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (44+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(44+2)*4(r1) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (44+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(44+3)*4(r1) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (44+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 8*4+(48+0)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (48+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 8*4+(48+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (48+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(48+2)*4(r1) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (48+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(48+3)*4(r1) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (48+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 
+ xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 8*4+(52+0)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (52+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 8*4+(52+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (52+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(52+2)*4(r1) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (52+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(52+3)*4(r1) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (52+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 8*4+(56+0)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (56+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 8*4+(56+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (56+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(56+2)*4(r1) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (56+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor 
r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(56+3)*4(r1) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (56+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 8*4+(60+0)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (60+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 8*4+(60+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (60+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(60+2)*4(r1) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (60+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(60+3)*4(r1) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (60+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + + lwz r12, 0*4(r3) + lwz r13, 1*4(r3) + lwz r14, 2*4(r3) + lwz r15, 3*4(r3) + add r4, r4, r12 + add r5, r5, r13 + add r6, r6, r14 + add r7, r7, r15 + stw r4, 0*4(r3) + stw r5, 1*4(r3) + stw r6, 2*4(r3) + stw r7, 3*4(r3) + lwz r12, 4*4(r3) + lwz r13, 5*4(r3) + lwz r14, 6*4(r3) + lwz r15, 7*4(r3) + add r8, r8, r12 + add r9, r9, r13 + add r10, r10, r14 + add r11, r11, r15 + stw r8, 4*4(r3) + stw r9, 5*4(r3) + stw r10, 6*4(r3) + stw r11, 7*4(r3) + + ld r13, 2*4(r1) + ld r14, 4*4(r1) + ld r15, 6*4(r1) + ld r16, 72*4(r1) + addi r1, r1, 76*4 + blr + + + .align 2 + .globl sha256d_ms + .globl _sha256d_ms + .globl .sha256d_ms 
+#ifdef __ELF__ + .type sha256d_ms, %function +#endif +sha256d_ms: +_sha256d_ms: +.sha256d_ms: + stdu r1, -80*4(r1) + std r13, 2*4(r1) + std r14, 4*4(r1) + std r15, 6*4(r1) + std r16, 72*4(r1) + std r17, 74*4(r1) + std r18, 76*4(r1) + + mr r17, r4 + mr r18, r5 + mr r16, r6 + + lwz r14, 3*4(r17) + lwz r6, 18*4(r17) + lwz r7, 19*4(r17) + + rotrwi r12, r14, 7 + rotrwi r13, r14, 18 + stw r6, 8*4+18*4(r1) + xor r12, r12, r13 + srwi r13, r14, 3 + stw r7, 8*4+19*4(r1) + xor r12, r12, r13 + lwz r8, 20*4(r17) + add r6, r6, r12 + lwz r10, 22*4(r17) + add r7, r7, r14 + stw r6, 18*4(r17) + + rotrwi r12, r6, 17 + rotrwi r13, r6, 19 + stw r7, 19*4(r17) + xor r12, r12, r13 + srwi r13, r6, 10 + stw r8, 8*4+20*4(r1) + xor r12, r12, r13 + lwz r4, 23*4(r17) + add r8, r8, r12 + lwz r5, 24*4(r17) + + rotrwi r9, r7, 17 + rotrwi r13, r7, 19 + stw r8, 20*4(r17) + xor r9, r9, r13 + srwi r13, r7, 10 + stw r10, 8*4+21*4(r1) + xor r9, r9, r13 + stw r4, 8*4+22*4(r1) + + rotrwi r12, r8, 17 + rotrwi r13, r8, 19 + stw r9, 21*4(r17) + xor r12, r12, r13 + srwi r13, r8, 10 + stw r5, 8*4+23*4(r1) + xor r12, r12, r13 + rotrwi r14, r9, 17 + rotrwi r13, r9, 19 + add r10, r10, r12 + lwz r11, 30*4(r17) + + xor r14, r14, r13 + srwi r13, r9, 10 + stw r10, 22*4(r17) + xor r14, r14, r13 + stw r11, 8*4+24*4(r1) + add r4, r4, r14 + + rotrwi r12, r10, 17 + rotrwi r13, r10, 19 + stw r4, 23*4(r17) + xor r12, r12, r13 + srwi r13, r10, 10 + rotrwi r14, r4, 17 + xor r12, r12, r13 + rotrwi r13, r4, 19 + xor r14, r14, r13 + srwi r13, r4, 10 + add r5, r5, r12 + xor r14, r14, r13 + stw r5, 24*4(r17) + add r6, r6, r14 + + rotrwi r12, r5, 17 + rotrwi r13, r5, 19 + stw r6, 25*4(r17) + xor r12, r12, r13 + srwi r13, r5, 10 + rotrwi r14, r6, 17 + xor r12, r12, r13 + rotrwi r13, r6, 19 + xor r14, r14, r13 + srwi r13, r6, 10 + add r7, r7, r12 + xor r14, r14, r13 + stw r7, 26*4(r17) + add r8, r8, r14 + + rotrwi r12, r7, 17 + rotrwi r13, r7, 19 + stw r8, 27*4(r17) + xor r12, r12, r13 + srwi r13, r7, 10 + rotrwi r14, r8, 17 + xor r12, r12, r13 + rotrwi r13, r8, 19 + xor r14, r14, r13 + srwi r13, r8, 10 + add r9, r9, r12 + xor r14, r14, r13 + stw r9, 28*4(r17) + add r10, r10, r14 + + lwz r14, 31*4(r17) + rotrwi r12, r9, 17 + rotrwi r13, r9, 19 + stw r10, 29*4(r17) + xor r12, r12, r13 + srwi r13, r9, 10 + stw r14, 8*4+25*4(r1) + xor r12, r12, r13 + add r11, r11, r12 + add r5, r5, r14 + rotrwi r12, r10, 17 + rotrwi r13, r10, 19 + add r4, r4, r11 + + lwz r11, 16*4(r17) + xor r12, r12, r13 + srwi r13, r10, 10 + stw r4, 30*4(r17) + xor r12, r12, r13 + add r5, r5, r12 + stw r5, 31*4(r17) + + lwz r14, 0+(16+1)*4(r17) + rotrwi r12, r4, 17 + rotrwi r13, r4, 19 + add r11, r11, r6 + xor r12, r12, r13 + srwi r13, r4, 10 + rotrwi r6, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r6, r6, r13 + srwi r13, r14, 3 + lwz r11, 0+(16+2)*4(r17) + xor r6, r6, r13 + rotrwi r13, r5, 19 + add r6, r6, r12 + + rotrwi r12, r5, 17 + add r14, r14, r7 + xor r12, r12, r13 + srwi r13, r5, 10 + rotrwi r7, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r6, 0+(16+16)*4(r17) + xor r7, r7, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r7, r7, r13 + add r7, r7, r14 + stw r7, 0+(16+17)*4(r17) + lwz r14, 0+(18+1)*4(r17) + rotrwi r12, r6, 17 + rotrwi r13, r6, 19 + add r11, r11, r8 + xor r12, r12, r13 + srwi r13, r6, 10 + rotrwi r8, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r8, r8, r13 + srwi r13, r14, 3 + lwz r11, 0+(18+2)*4(r17) + xor r8, r8, r13 + rotrwi r13, r7, 19 + add r8, r8, r12 + + rotrwi r12, r7, 17 + add r14, r14, r9 
+ xor r12, r12, r13 + srwi r13, r7, 10 + rotrwi r9, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r8, 0+(18+16)*4(r17) + xor r9, r9, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r9, r9, r13 + add r9, r9, r14 + stw r9, 0+(18+17)*4(r17) + lwz r14, 0+(20+1)*4(r17) + rotrwi r12, r8, 17 + rotrwi r13, r8, 19 + add r11, r11, r10 + xor r12, r12, r13 + srwi r13, r8, 10 + rotrwi r10, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r10, r10, r13 + srwi r13, r14, 3 + lwz r11, 0+(20+2)*4(r17) + xor r10, r10, r13 + rotrwi r13, r9, 19 + add r10, r10, r12 + + rotrwi r12, r9, 17 + add r14, r14, r4 + xor r12, r12, r13 + srwi r13, r9, 10 + rotrwi r4, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r10, 0+(20+16)*4(r17) + xor r4, r4, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r4, r4, r13 + add r4, r4, r14 + stw r4, 0+(20+17)*4(r17) + lwz r14, 0+(22+1)*4(r17) + rotrwi r12, r10, 17 + rotrwi r13, r10, 19 + add r11, r11, r5 + xor r12, r12, r13 + srwi r13, r10, 10 + rotrwi r5, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r5, r5, r13 + srwi r13, r14, 3 + lwz r11, 0+(22+2)*4(r17) + xor r5, r5, r13 + rotrwi r13, r4, 19 + add r5, r5, r12 + + rotrwi r12, r4, 17 + add r14, r14, r6 + xor r12, r12, r13 + srwi r13, r4, 10 + rotrwi r6, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r5, 0+(22+16)*4(r17) + xor r6, r6, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r6, r6, r13 + add r6, r6, r14 + stw r6, 0+(22+17)*4(r17) + lwz r14, 0+(24+1)*4(r17) + rotrwi r12, r5, 17 + rotrwi r13, r5, 19 + add r11, r11, r7 + xor r12, r12, r13 + srwi r13, r5, 10 + rotrwi r7, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r7, r7, r13 + srwi r13, r14, 3 + lwz r11, 0+(24+2)*4(r17) + xor r7, r7, r13 + rotrwi r13, r6, 19 + add r7, r7, r12 + + rotrwi r12, r6, 17 + add r14, r14, r8 + xor r12, r12, r13 + srwi r13, r6, 10 + rotrwi r8, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r7, 0+(24+16)*4(r17) + xor r8, r8, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r8, r8, r13 + add r8, r8, r14 + stw r8, 0+(24+17)*4(r17) + lwz r14, 0+(26+1)*4(r17) + rotrwi r12, r7, 17 + rotrwi r13, r7, 19 + add r11, r11, r9 + xor r12, r12, r13 + srwi r13, r7, 10 + rotrwi r9, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r9, r9, r13 + srwi r13, r14, 3 + lwz r11, 0+(26+2)*4(r17) + xor r9, r9, r13 + rotrwi r13, r8, 19 + add r9, r9, r12 + + rotrwi r12, r8, 17 + add r14, r14, r10 + xor r12, r12, r13 + srwi r13, r8, 10 + rotrwi r10, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r9, 0+(26+16)*4(r17) + xor r10, r10, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r10, r10, r13 + add r10, r10, r14 + stw r10, 0+(26+17)*4(r17) + lwz r14, 0+(28+1)*4(r17) + rotrwi r12, r9, 17 + rotrwi r13, r9, 19 + add r11, r11, r4 + xor r12, r12, r13 + srwi r13, r9, 10 + rotrwi r4, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r4, r4, r13 + srwi r13, r14, 3 + lwz r11, 0+(28+2)*4(r17) + xor r4, r4, r13 + rotrwi r13, r10, 19 + add r4, r4, r12 + + rotrwi r12, r10, 17 + add r14, r14, r5 + xor r12, r12, r13 + srwi r13, r10, 10 + rotrwi r5, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r4, 0+(28+16)*4(r17) + xor r5, r5, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r5, r5, r13 + add r5, r5, r14 + stw r5, 0+(28+17)*4(r17) + lwz r14, 0+(30+1)*4(r17) + rotrwi r12, r4, 17 + rotrwi r13, r4, 19 + add r11, r11, r6 + xor r12, r12, r13 + srwi r13, r4, 10 + rotrwi r6, r14, 7 + xor r12, r12, r13 + rotrwi r13, 
r14, 18 + add r12, r12, r11 + xor r6, r6, r13 + srwi r13, r14, 3 + lwz r11, 0+(30+2)*4(r17) + xor r6, r6, r13 + rotrwi r13, r5, 19 + add r6, r6, r12 + + rotrwi r12, r5, 17 + add r14, r14, r7 + xor r12, r12, r13 + srwi r13, r5, 10 + rotrwi r7, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r6, 0+(30+16)*4(r17) + xor r7, r7, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r7, r7, r13 + add r7, r7, r14 + stw r7, 0+(30+17)*4(r17) + lwz r14, 0+(32+1)*4(r17) + rotrwi r12, r6, 17 + rotrwi r13, r6, 19 + add r11, r11, r8 + xor r12, r12, r13 + srwi r13, r6, 10 + rotrwi r8, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r8, r8, r13 + srwi r13, r14, 3 + lwz r11, 0+(32+2)*4(r17) + xor r8, r8, r13 + rotrwi r13, r7, 19 + add r8, r8, r12 + + rotrwi r12, r7, 17 + add r14, r14, r9 + xor r12, r12, r13 + srwi r13, r7, 10 + rotrwi r9, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r8, 0+(32+16)*4(r17) + xor r9, r9, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r9, r9, r13 + add r9, r9, r14 + stw r9, 0+(32+17)*4(r17) + lwz r14, 0+(34+1)*4(r17) + rotrwi r12, r8, 17 + rotrwi r13, r8, 19 + add r11, r11, r10 + xor r12, r12, r13 + srwi r13, r8, 10 + rotrwi r10, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r10, r10, r13 + srwi r13, r14, 3 + lwz r11, 0+(34+2)*4(r17) + xor r10, r10, r13 + rotrwi r13, r9, 19 + add r10, r10, r12 + + rotrwi r12, r9, 17 + add r14, r14, r4 + xor r12, r12, r13 + srwi r13, r9, 10 + rotrwi r4, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r10, 0+(34+16)*4(r17) + xor r4, r4, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r4, r4, r13 + add r4, r4, r14 + stw r4, 0+(34+17)*4(r17) + lwz r14, 0+(36+1)*4(r17) + rotrwi r12, r10, 17 + rotrwi r13, r10, 19 + add r11, r11, r5 + xor r12, r12, r13 + srwi r13, r10, 10 + rotrwi r5, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r5, r5, r13 + srwi r13, r14, 3 + lwz r11, 0+(36+2)*4(r17) + xor r5, r5, r13 + rotrwi r13, r4, 19 + add r5, r5, r12 + + rotrwi r12, r4, 17 + add r14, r14, r6 + xor r12, r12, r13 + srwi r13, r4, 10 + rotrwi r6, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r5, 0+(36+16)*4(r17) + xor r6, r6, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r6, r6, r13 + add r6, r6, r14 + stw r6, 0+(36+17)*4(r17) + lwz r14, 0+(38+1)*4(r17) + rotrwi r12, r5, 17 + rotrwi r13, r5, 19 + add r11, r11, r7 + xor r12, r12, r13 + srwi r13, r5, 10 + rotrwi r7, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r7, r7, r13 + srwi r13, r14, 3 + lwz r11, 0+(38+2)*4(r17) + xor r7, r7, r13 + rotrwi r13, r6, 19 + add r7, r7, r12 + + rotrwi r12, r6, 17 + add r14, r14, r8 + xor r12, r12, r13 + srwi r13, r6, 10 + rotrwi r8, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r7, 0+(38+16)*4(r17) + xor r8, r8, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r8, r8, r13 + add r8, r8, r14 + stw r8, 0+(38+17)*4(r17) + lwz r14, 0+(40+1)*4(r17) + rotrwi r12, r7, 17 + rotrwi r13, r7, 19 + add r11, r11, r9 + xor r12, r12, r13 + srwi r13, r7, 10 + rotrwi r9, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r9, r9, r13 + srwi r13, r14, 3 + lwz r11, 0+(40+2)*4(r17) + xor r9, r9, r13 + rotrwi r13, r8, 19 + add r9, r9, r12 + + rotrwi r12, r8, 17 + add r14, r14, r10 + xor r12, r12, r13 + srwi r13, r8, 10 + rotrwi r10, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r9, 0+(40+16)*4(r17) + xor r10, r10, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r10, r10, r13 + add r10, r10, r14 + stw r10, 
0+(40+17)*4(r17) + lwz r14, 0+(42+1)*4(r17) + rotrwi r12, r9, 17 + rotrwi r13, r9, 19 + add r11, r11, r4 + xor r12, r12, r13 + srwi r13, r9, 10 + rotrwi r4, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r4, r4, r13 + srwi r13, r14, 3 + lwz r11, 0+(42+2)*4(r17) + xor r4, r4, r13 + rotrwi r13, r10, 19 + add r4, r4, r12 + + rotrwi r12, r10, 17 + add r14, r14, r5 + xor r12, r12, r13 + srwi r13, r10, 10 + rotrwi r5, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r4, 0+(42+16)*4(r17) + xor r5, r5, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r5, r5, r13 + add r5, r5, r14 + stw r5, 0+(42+17)*4(r17) + lwz r14, 0+(44+1)*4(r17) + rotrwi r12, r4, 17 + rotrwi r13, r4, 19 + add r11, r11, r6 + xor r12, r12, r13 + srwi r13, r4, 10 + rotrwi r6, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r6, r6, r13 + srwi r13, r14, 3 + lwz r11, 0+(44+2)*4(r17) + xor r6, r6, r13 + rotrwi r13, r5, 19 + add r6, r6, r12 + + rotrwi r12, r5, 17 + add r14, r14, r7 + xor r12, r12, r13 + srwi r13, r5, 10 + rotrwi r7, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r6, 0+(44+16)*4(r17) + xor r7, r7, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r7, r7, r13 + add r7, r7, r14 + stw r7, 0+(44+17)*4(r17) + lwz r14, 0+(46+1)*4(r17) + rotrwi r12, r6, 17 + rotrwi r13, r6, 19 + add r11, r11, r8 + xor r12, r12, r13 + srwi r13, r6, 10 + rotrwi r8, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r8, r8, r13 + srwi r13, r14, 3 + lwz r11, 0+(46+2)*4(r17) + xor r8, r8, r13 + rotrwi r13, r7, 19 + add r8, r8, r12 + + rotrwi r12, r7, 17 + add r14, r14, r9 + xor r12, r12, r13 + srwi r13, r7, 10 + rotrwi r9, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r8, 0+(46+16)*4(r17) + xor r9, r9, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r9, r9, r13 + add r9, r9, r14 + stw r9, 0+(46+17)*4(r17) + + lwz r4, 0*4(r16) + lwz r9, 1*4(r16) + lwz r10, 2*4(r16) + lwz r11, 3*4(r16) + lwz r8, 4*4(r16) + lwz r5, 5*4(r16) + lwz r6, 6*4(r16) + lwz r7, 7*4(r16) +#ifdef _AIX + ld r16, T.sha256_k(r2) +#else + lis r16, HI(sha256_k) + addi r16, r16, LO(sha256_k) +#endif + + lwz r12, 0+(3)*4(r17) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 0+(4+0)*4(r17) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (4+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 0+(4+1)*4(r17) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (4+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + 
rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 0+(4+2)*4(r17) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (4+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 0+(4+3)*4(r17) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (4+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 0+(8+0)*4(r17) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (8+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 0+(8+1)*4(r17) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (8+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 0+(8+2)*4(r17) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (8+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 0+(8+3)*4(r17) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (8+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 0+(12+0)*4(r17) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (12+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor 
r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 0+(12+1)*4(r17) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (12+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 0+(12+2)*4(r17) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (12+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 0+(12+3)*4(r17) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (12+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 0+(16+0)*4(r17) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (16+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 0+(16+1)*4(r17) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (16+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 0+(16+2)*4(r17) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (16+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 0+(16+3)*4(r17) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (16+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor 
r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 0+(20+0)*4(r17) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (20+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 0+(20+1)*4(r17) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (20+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 0+(20+2)*4(r17) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (20+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 0+(20+3)*4(r17) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (20+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 0+(24+0)*4(r17) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (24+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 0+(24+1)*4(r17) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (24+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 0+(24+2)*4(r17) + and r13, 
r11, r10 + andc r14, r8, r10 + lwz r15, (24+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 0+(24+3)*4(r17) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (24+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 0+(28+0)*4(r17) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (28+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 0+(28+1)*4(r17) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (28+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 0+(28+2)*4(r17) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (28+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 0+(28+3)*4(r17) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (28+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 0+(32+0)*4(r17) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (32+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi 
r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 0+(32+1)*4(r17) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (32+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 0+(32+2)*4(r17) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (32+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 0+(32+3)*4(r17) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (32+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 0+(36+0)*4(r17) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (36+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 0+(36+1)*4(r17) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (36+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 0+(36+2)*4(r17) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (36+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 0+(36+3)*4(r17) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (36+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, 
r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 0+(40+0)*4(r17) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (40+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 0+(40+1)*4(r17) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (40+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 0+(40+2)*4(r17) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (40+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 0+(40+3)*4(r17) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (40+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 0+(44+0)*4(r17) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (44+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 0+(44+1)*4(r17) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (44+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 0+(44+2)*4(r17) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (44+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor 
r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 0+(44+3)*4(r17) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (44+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 0+(48+0)*4(r17) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (48+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 0+(48+1)*4(r17) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (48+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 0+(48+2)*4(r17) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (48+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 0+(48+3)*4(r17) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (48+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 0+(52+0)*4(r17) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (52+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 0+(52+1)*4(r17) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (52+1)*4(r16) + or r14, r14, r13 
+ rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 0+(52+2)*4(r17) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (52+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 0+(52+3)*4(r17) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (52+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 0+(56+0)*4(r17) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (56+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 0+(56+1)*4(r17) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (56+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 0+(56+2)*4(r17) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (56+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 0+(56+3)*4(r17) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (56+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 
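
Every unrolled round above follows the same template: Ch(e,f,g) built from and/andc/or, Sigma1(e) computed as rotr6(e ^ rotr5(e) ^ rotr19(e)), Maj(a,b,c) as (a & b) ^ ((a ^ b) & c), and Sigma0(a) as rotr2(a ^ rotr11(a) ^ rotr20(a)). A scalar sketch of one round, reusing the rotr32 helper from the earlier sketch; the state layout S[0..7] = a..h and the function name are illustrative:

/* One SHA-256 round; Wi is the schedule word, Ki the constant from
 * sha256_k.  The factored rotations expand to the usual
 * Sigma1 = rotr6^rotr11^rotr25 and Sigma0 = rotr2^rotr13^rotr22. */
static void sha256_round(uint32_t S[8], uint32_t Wi, uint32_t Ki)
{
        uint32_t ch   = (S[4] & S[5]) | (~S[4] & S[6]);          /* and / andc / or */
        uint32_t maj  = (S[0] & S[1]) ^ ((S[0] ^ S[1]) & S[2]);
        uint32_t sig1 = rotr32(S[4] ^ rotr32(S[4], 5) ^ rotr32(S[4], 19), 6);
        uint32_t sig0 = rotr32(S[0] ^ rotr32(S[0], 11) ^ rotr32(S[0], 20), 2);
        uint32_t t1   = S[7] + sig1 + ch + Ki + Wi;
        uint32_t t2   = sig0 + maj;

        S[7] = S[6]; S[6] = S[5]; S[5] = S[4]; S[4] = S[3] + t1;
        S[3] = S[2]; S[2] = S[1]; S[1] = S[0]; S[0] = t1 + t2;
}

The assembly avoids the final register shuffle by renaming: the roles of a..h rotate through r4..r11 from one unrolled round to the next instead of the values being moved.
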
+ lwz r12, 0+(60+0)*4(r17) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (60+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 0+(60+1)*4(r17) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (60+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 0+(60+2)*4(r17) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (60+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 0+(60+3)*4(r17) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (60+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + + lwz r12, 0*4(r18) + lwz r13, 1*4(r18) + lwz r14, 2*4(r18) + lwz r15, 3*4(r18) + add r4, r4, r12 + add r5, r5, r13 + add r6, r6, r14 + add r7, r7, r15 + stw r4, 8*4+0*4(r1) + stw r5, 8*4+1*4(r1) + stw r6, 8*4+2*4(r1) + stw r7, 8*4+3*4(r1) + lwz r12, 4*4(r18) + lwz r13, 5*4(r18) + lwz r14, 6*4(r18) + lwz r15, 7*4(r18) + add r8, r8, r12 + add r9, r9, r13 + add r10, r10, r14 + add r11, r11, r15 + stw r8, 8*4+4*4(r1) + stw r9, 8*4+5*4(r1) + stw r10, 8*4+6*4(r1) + stw r11, 8*4+7*4(r1) + + lwz r4, 8*4+18*4(r1) + lwz r5, 8*4+19*4(r1) + lwz r6, 8*4+20*4(r1) + lwz r7, 8*4+21*4(r1) + lwz r8, 8*4+22*4(r1) + lwz r9, 8*4+23*4(r1) + lwz r10, 8*4+24*4(r1) + lwz r11, 8*4+25*4(r1) + stw r4, 18*4(r17) + stw r5, 19*4(r17) + stw r6, 20*4(r17) + stw r7, 22*4(r17) + stw r8, 23*4(r17) + stw r9, 24*4(r17) + stw r10, 30*4(r17) + stw r11, 31*4(r17) + + lis r8, 0x8000 + li r9, 0 + li r10, 0x0100 + + lwz r14, 8*4+1*4(r1) + lwz r4, 8*4+0*4(r1) + + lwz r11, 8*4+2*4(r1) + rotrwi r12, r14, 7 + rotrwi r13, r14, 18 + + stw r8, 8*4+8*4(r1) + stw r9, 8*4+9*4(r1) + stw r9, 8*4+10*4(r1) + stw r9, 8*4+11*4(r1) + stw r9, 8*4+12*4(r1) + stw r9, 8*4+13*4(r1) + stw r9, 8*4+14*4(r1) + stw r10, 8*4+15*4(r1) + + xor r12, r12, r13 + srwi r13, r14, 3 + addis r5, r14, 0x00a0 + xor r12, r12, r13 + rotrwi r14, r11, 7 + rotrwi r13, r11, 18 + add r4, r4, r12 + xor r14, r14, r13 + srwi r13, r11, 3 + stw r4, 8*4+16*4(r1) + xor r14, r14, r13 + rotrwi r12, r4, 17 + rotrwi r13, r4, 19 + 
add r5, r5, r14 + lwz r14, 8*4+3*4(r1) + + stw r5, 8*4+17*4(r1) + xor r12, r12, r13 + srwi r13, r4, 10 + rotrwi r6, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + xor r6, r6, r13 + srwi r13, r14, 3 + add r11, r11, r12 + xor r6, r6, r13 + rotrwi r12, r5, 17 + rotrwi r13, r5, 19 + add r6, r6, r11 + lwz r11, 8*4+4*4(r1) + + stw r6, 8*4+18*4(r1) + xor r12, r12, r13 + srwi r13, r5, 10 + rotrwi r7, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + xor r7, r7, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r7, r7, r13 + rotrwi r12, r6, 17 + rotrwi r13, r6, 19 + add r7, r7, r14 + lwz r14, 8*4+5*4(r1) + + stw r7, 8*4+19*4(r1) + xor r12, r12, r13 + srwi r13, r6, 10 + rotrwi r8, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + xor r8, r8, r13 + srwi r13, r14, 3 + add r11, r11, r12 + xor r8, r8, r13 + rotrwi r12, r7, 17 + rotrwi r13, r7, 19 + add r8, r8, r11 + lwz r11, 8*4+6*4(r1) + + stw r8, 8*4+20*4(r1) + xor r12, r12, r13 + srwi r13, r7, 10 + rotrwi r9, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + xor r9, r9, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r9, r9, r13 + rotrwi r12, r8, 17 + rotrwi r13, r8, 19 + add r9, r9, r14 + lwz r14, 8*4+7*4(r1) + + stw r9, 8*4+21*4(r1) + xor r12, r12, r13 + srwi r13, r8, 10 + rotrwi r10, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + xor r10, r10, r13 + srwi r13, r14, 3 + add r11, r11, r12 + xor r10, r10, r13 + rotrwi r12, r9, 17 + rotrwi r13, r9, 19 + addi r11, r11, 0x0100 + add r14, r14, r4 + add r10, r10, r11 + + xor r12, r12, r13 + srwi r13, r9, 10 + stw r10, 8*4+22*4(r1) + addis r14, r14, 0x1100 + xor r12, r12, r13 + add r14, r14, r12 + rotrwi r12, r10, 17 + rotrwi r13, r10, 19 + addi r4, r14, 0x2000 + xor r12, r12, r13 + srwi r13, r10, 10 + stw r4, 8*4+23*4(r1) + addis r5, r5, 0x8000 + xor r12, r12, r13 + add r5, r5, r12 + + rotrwi r12, r4, 17 + rotrwi r13, r4, 19 + stw r5, 8*4+24*4(r1) + xor r12, r12, r13 + srwi r13, r4, 10 + rotrwi r11, r5, 17 + xor r12, r12, r13 + rotrwi r13, r5, 19 + xor r11, r11, r13 + srwi r13, r5, 10 + add r6, r6, r12 + xor r11, r11, r13 + stw r6, 8*4+25*4(r1) + add r7, r7, r11 + + rotrwi r12, r6, 17 + rotrwi r13, r6, 19 + stw r7, 8*4+26*4(r1) + xor r12, r12, r13 + srwi r13, r6, 10 + rotrwi r11, r7, 17 + xor r12, r12, r13 + rotrwi r13, r7, 19 + xor r11, r11, r13 + srwi r13, r7, 10 + add r8, r8, r12 + xor r11, r11, r13 + stw r8, 8*4+27*4(r1) + add r9, r9, r11 + + rotrwi r14, r8, 17 + rotrwi r13, r8, 19 + rotrwi r12, r9, 17 + stw r9, 8*4+28*4(r1) + addis r4, r4, 0x0040 + xor r14, r14, r13 + rotrwi r13, r9, 19 + xor r12, r12, r13 + srwi r13, r8, 10 + xor r14, r14, r13 + srwi r13, r9, 10 + xor r12, r12, r13 + addi r4, r4, 0x0022 + add r10, r10, r14 + add r4, r4, r12 + lwz r11, 8*4+16*4(r1) + + addi r5, r5, 0x0100 + stw r4, 8*4+30*4(r1) + rotrwi r14, r11, 7 + stw r10, 8*4+29*4(r1) + rotrwi r13, r11, 18 + rotrwi r12, r10, 17 + xor r14, r14, r13 + rotrwi r13, r10, 19 + xor r12, r12, r13 + srwi r13, r11, 3 + xor r14, r14, r13 + srwi r13, r10, 10 + xor r12, r12, r13 + add r5, r5, r14 + add r5, r5, r12 + stw r5, 8*4+31*4(r1) + + lwz r14, 8*4+(16+1)*4(r1) + rotrwi r12, r4, 17 + rotrwi r13, r4, 19 + add r11, r11, r6 + xor r12, r12, r13 + srwi r13, r4, 10 + rotrwi r6, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r6, r6, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(16+2)*4(r1) + xor r6, r6, r13 + rotrwi r13, r5, 19 + add r6, r6, r12 + + rotrwi r12, r5, 17 + add r14, r14, r7 + xor r12, r12, r13 + srwi r13, r5, 10 + rotrwi r7, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r6, 8*4+(16+16)*4(r1) 
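
The lis/li constants stored to the stack a little earlier (0x80000000 into word 8, zeros into words 9-14, 0x00000100 into word 15) are the fixed padding for the second SHA-256 of sha256d: the 32-byte first-pass digest always fits one block, so the padding never changes. A sketch of that block layout; the function name is illustrative:

#include <stdint.h>

/* Second-pass message block for sha256d: 8 digest words, the 0x80
 * terminator bit, six zero words, and the 256-bit length field. */
static void sha256d_pad_block2(uint32_t block[16], const uint32_t hash1[8])
{
        int i;

        for (i = 0; i < 8; i++)
                block[i] = hash1[i];
        block[8] = 0x80000000;          /* padding: leading 1 bit    */
        for (i = 9; i < 15; i++)
                block[i] = 0;
        block[15] = 0x00000100;         /* message length = 256 bits */
}
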
+ xor r7, r7, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r7, r7, r13 + add r7, r7, r14 + stw r7, 8*4+(16+17)*4(r1) + lwz r14, 8*4+(18+1)*4(r1) + rotrwi r12, r6, 17 + rotrwi r13, r6, 19 + add r11, r11, r8 + xor r12, r12, r13 + srwi r13, r6, 10 + rotrwi r8, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r8, r8, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(18+2)*4(r1) + xor r8, r8, r13 + rotrwi r13, r7, 19 + add r8, r8, r12 + + rotrwi r12, r7, 17 + add r14, r14, r9 + xor r12, r12, r13 + srwi r13, r7, 10 + rotrwi r9, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r8, 8*4+(18+16)*4(r1) + xor r9, r9, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r9, r9, r13 + add r9, r9, r14 + stw r9, 8*4+(18+17)*4(r1) + lwz r14, 8*4+(20+1)*4(r1) + rotrwi r12, r8, 17 + rotrwi r13, r8, 19 + add r11, r11, r10 + xor r12, r12, r13 + srwi r13, r8, 10 + rotrwi r10, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r10, r10, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(20+2)*4(r1) + xor r10, r10, r13 + rotrwi r13, r9, 19 + add r10, r10, r12 + + rotrwi r12, r9, 17 + add r14, r14, r4 + xor r12, r12, r13 + srwi r13, r9, 10 + rotrwi r4, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r10, 8*4+(20+16)*4(r1) + xor r4, r4, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r4, r4, r13 + add r4, r4, r14 + stw r4, 8*4+(20+17)*4(r1) + lwz r14, 8*4+(22+1)*4(r1) + rotrwi r12, r10, 17 + rotrwi r13, r10, 19 + add r11, r11, r5 + xor r12, r12, r13 + srwi r13, r10, 10 + rotrwi r5, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r5, r5, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(22+2)*4(r1) + xor r5, r5, r13 + rotrwi r13, r4, 19 + add r5, r5, r12 + + rotrwi r12, r4, 17 + add r14, r14, r6 + xor r12, r12, r13 + srwi r13, r4, 10 + rotrwi r6, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r5, 8*4+(22+16)*4(r1) + xor r6, r6, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r6, r6, r13 + add r6, r6, r14 + stw r6, 8*4+(22+17)*4(r1) + lwz r14, 8*4+(24+1)*4(r1) + rotrwi r12, r5, 17 + rotrwi r13, r5, 19 + add r11, r11, r7 + xor r12, r12, r13 + srwi r13, r5, 10 + rotrwi r7, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r7, r7, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(24+2)*4(r1) + xor r7, r7, r13 + rotrwi r13, r6, 19 + add r7, r7, r12 + + rotrwi r12, r6, 17 + add r14, r14, r8 + xor r12, r12, r13 + srwi r13, r6, 10 + rotrwi r8, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r7, 8*4+(24+16)*4(r1) + xor r8, r8, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r8, r8, r13 + add r8, r8, r14 + stw r8, 8*4+(24+17)*4(r1) + lwz r14, 8*4+(26+1)*4(r1) + rotrwi r12, r7, 17 + rotrwi r13, r7, 19 + add r11, r11, r9 + xor r12, r12, r13 + srwi r13, r7, 10 + rotrwi r9, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r9, r9, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(26+2)*4(r1) + xor r9, r9, r13 + rotrwi r13, r8, 19 + add r9, r9, r12 + + rotrwi r12, r8, 17 + add r14, r14, r10 + xor r12, r12, r13 + srwi r13, r8, 10 + rotrwi r10, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r9, 8*4+(26+16)*4(r1) + xor r10, r10, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r10, r10, r13 + add r10, r10, r14 + stw r10, 8*4+(26+17)*4(r1) + lwz r14, 8*4+(28+1)*4(r1) + rotrwi r12, r9, 17 + rotrwi r13, r9, 19 + add r11, r11, r4 + xor r12, r12, r13 + srwi r13, r9, 10 + rotrwi r4, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r4, r4, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(28+2)*4(r1) + xor r4, r4, r13 
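
Because words 8-15 of that second block are compile-time constants, several sigma terms of its message schedule fold into the immediates visible above: addis 0x1100 plus addi 0x2000, addis 0x00a0, addis 0x0040 plus addi 0x0022, and the direct addis 0x8000 and addi 0x0100 contributions of W[8] and W[15]. A small self-contained check of those foldings, using the same sigma definitions as the earlier sketch:

#include <assert.h>
#include <stdint.h>

static uint32_t rr(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }
static uint32_t f0(uint32_t x) { return rr(x, 7) ^ rr(x, 18) ^ (x >> 3); }
static uint32_t f1(uint32_t x) { return rr(x, 17) ^ rr(x, 19) ^ (x >> 10); }

int main(void)
{
        assert(f0(0x80000000u) == 0x11002000u);  /* addis 0x1100 + addi 0x2000 */
        assert(f1(0x00000100u) == 0x00a00000u);  /* addis 0x00a0               */
        assert(f0(0x00000100u) == 0x00400022u);  /* addis 0x0040 + addi 0x0022 */
        return 0;
}
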
+ rotrwi r13, r10, 19 + add r4, r4, r12 + + rotrwi r12, r10, 17 + add r14, r14, r5 + xor r12, r12, r13 + srwi r13, r10, 10 + rotrwi r5, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r4, 8*4+(28+16)*4(r1) + xor r5, r5, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r5, r5, r13 + add r5, r5, r14 + stw r5, 8*4+(28+17)*4(r1) + lwz r14, 8*4+(30+1)*4(r1) + rotrwi r12, r4, 17 + rotrwi r13, r4, 19 + add r11, r11, r6 + xor r12, r12, r13 + srwi r13, r4, 10 + rotrwi r6, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r6, r6, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(30+2)*4(r1) + xor r6, r6, r13 + rotrwi r13, r5, 19 + add r6, r6, r12 + + rotrwi r12, r5, 17 + add r14, r14, r7 + xor r12, r12, r13 + srwi r13, r5, 10 + rotrwi r7, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r6, 8*4+(30+16)*4(r1) + xor r7, r7, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r7, r7, r13 + add r7, r7, r14 + stw r7, 8*4+(30+17)*4(r1) + lwz r14, 8*4+(32+1)*4(r1) + rotrwi r12, r6, 17 + rotrwi r13, r6, 19 + add r11, r11, r8 + xor r12, r12, r13 + srwi r13, r6, 10 + rotrwi r8, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r8, r8, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(32+2)*4(r1) + xor r8, r8, r13 + rotrwi r13, r7, 19 + add r8, r8, r12 + + rotrwi r12, r7, 17 + add r14, r14, r9 + xor r12, r12, r13 + srwi r13, r7, 10 + rotrwi r9, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r8, 8*4+(32+16)*4(r1) + xor r9, r9, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r9, r9, r13 + add r9, r9, r14 + stw r9, 8*4+(32+17)*4(r1) + lwz r14, 8*4+(34+1)*4(r1) + rotrwi r12, r8, 17 + rotrwi r13, r8, 19 + add r11, r11, r10 + xor r12, r12, r13 + srwi r13, r8, 10 + rotrwi r10, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r10, r10, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(34+2)*4(r1) + xor r10, r10, r13 + rotrwi r13, r9, 19 + add r10, r10, r12 + + rotrwi r12, r9, 17 + add r14, r14, r4 + xor r12, r12, r13 + srwi r13, r9, 10 + rotrwi r4, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r10, 8*4+(34+16)*4(r1) + xor r4, r4, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r4, r4, r13 + add r4, r4, r14 + stw r4, 8*4+(34+17)*4(r1) + lwz r14, 8*4+(36+1)*4(r1) + rotrwi r12, r10, 17 + rotrwi r13, r10, 19 + add r11, r11, r5 + xor r12, r12, r13 + srwi r13, r10, 10 + rotrwi r5, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r5, r5, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(36+2)*4(r1) + xor r5, r5, r13 + rotrwi r13, r4, 19 + add r5, r5, r12 + + rotrwi r12, r4, 17 + add r14, r14, r6 + xor r12, r12, r13 + srwi r13, r4, 10 + rotrwi r6, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r5, 8*4+(36+16)*4(r1) + xor r6, r6, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r6, r6, r13 + add r6, r6, r14 + stw r6, 8*4+(36+17)*4(r1) + lwz r14, 8*4+(38+1)*4(r1) + rotrwi r12, r5, 17 + rotrwi r13, r5, 19 + add r11, r11, r7 + xor r12, r12, r13 + srwi r13, r5, 10 + rotrwi r7, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r7, r7, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(38+2)*4(r1) + xor r7, r7, r13 + rotrwi r13, r6, 19 + add r7, r7, r12 + + rotrwi r12, r6, 17 + add r14, r14, r8 + xor r12, r12, r13 + srwi r13, r6, 10 + rotrwi r8, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r7, 8*4+(38+16)*4(r1) + xor r8, r8, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r8, r8, r13 + add r8, r8, r14 + stw r8, 8*4+(38+17)*4(r1) + lwz r14, 8*4+(40+1)*4(r1) + rotrwi r12, r7, 17 + rotrwi r13, r7, 19 + add r11, r11, r9 
+ xor r12, r12, r13 + srwi r13, r7, 10 + rotrwi r9, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r9, r9, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(40+2)*4(r1) + xor r9, r9, r13 + rotrwi r13, r8, 19 + add r9, r9, r12 + + rotrwi r12, r8, 17 + add r14, r14, r10 + xor r12, r12, r13 + srwi r13, r8, 10 + rotrwi r10, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r9, 8*4+(40+16)*4(r1) + xor r10, r10, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r10, r10, r13 + add r10, r10, r14 + stw r10, 8*4+(40+17)*4(r1) + lwz r14, 8*4+(42+1)*4(r1) + rotrwi r12, r9, 17 + rotrwi r13, r9, 19 + add r11, r11, r4 + xor r12, r12, r13 + srwi r13, r9, 10 + rotrwi r4, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor r4, r4, r13 + srwi r13, r14, 3 + lwz r11, 8*4+(42+2)*4(r1) + xor r4, r4, r13 + rotrwi r13, r10, 19 + add r4, r4, r12 + + rotrwi r12, r10, 17 + add r14, r14, r5 + xor r12, r12, r13 + srwi r13, r10, 10 + rotrwi r5, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw r4, 8*4+(42+16)*4(r1) + xor r5, r5, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r5, r5, r13 + add r5, r5, r14 + stw r5, 8*4+(42+17)*4(r1) + +#ifdef _AIX + ld r18, T.sha256_h(r2) +#else + lis r18, HI(sha256_h) + addi r18, r18, LO(sha256_h) +#endif + + lwz r14, 8*4+(44+1)*4(r1) + rotrwi r12, r4, 17 + rotrwi r13, r4, 19 + add r15, r11, r6 + rotrwi r6, r14, 7 + rotrwi r11, r14, 18 + xor r12, r12, r13 + xor r6, r6, r11 + + lwz r8, 4*4(r18) + lwz r9, 5*4(r18) + lwz r10, 6*4(r18) + lwz r11, 7*4(r18) + + srwi r13, r4, 10 + srwi r14, r14, 3 + xor r12, r12, r13 + xor r6, r6, r14 + add r12, r12, r15 + add r6, r6, r12 + stw r6, 8*4+(44+16)*4(r1) + + lwz r4, 0*4(r18) + lwz r5, 1*4(r18) + lwz r6, 2*4(r18) + lwz r7, 3*4(r18) + + lwz r12, 8*4+(0+0)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (0+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 8*4+(0+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (0+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(0+2)*4(r1) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (0+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(0+3)*4(r1) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (0+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, 
r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 8*4+(4+0)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (4+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 8*4+(4+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (4+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(4+2)*4(r1) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (4+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(4+3)*4(r1) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (4+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 8*4+(8+0)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (8+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 8*4+(8+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (8+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(8+2)*4(r1) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (8+2)*4(r16) + or r14, r14, r13 + 
rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(8+3)*4(r1) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (8+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 8*4+(12+0)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (12+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 8*4+(12+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (12+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(12+2)*4(r1) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (12+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(12+3)*4(r1) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (12+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 8*4+(16+0)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (16+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + 
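
Editor's note on the unrolled block above: each repetition is one standard SHA-256 round. Ch(e,f,g) is built from the and/andc/or triple, Maj(a,b,c) from the (a & b) ^ ((a ^ b) & c) form, and the big-sigma functions are factored so that each costs only three rotrwi and two xor, using ROTR6(x ^ ROTR5(x) ^ ROTR19(x)) = ROTR6(x) ^ ROTR11(x) ^ ROTR25(x) and ROTR2(x ^ ROTR11(x) ^ ROTR20(x)) = ROTR2(x) ^ ROTR13(x) ^ ROTR22(x). A minimal C sketch of the same round follows; the helper names (rotr32, sha256_round) are ours for illustration and make no claim to match the register allocation in the assembly.

    #include <stdint.h>

    static inline uint32_t rotr32(uint32_t x, unsigned n)
    {
        return (x >> n) | (x << (32 - n));
    }

    /* Sigma1(e) = ROTR6 ^ ROTR11 ^ ROTR25, factored as in the assembly. */
    static inline uint32_t Sigma1(uint32_t e)
    {
        return rotr32(e ^ rotr32(e, 5) ^ rotr32(e, 19), 6);
    }

    /* Sigma0(a) = ROTR2 ^ ROTR13 ^ ROTR22, factored as in the assembly. */
    static inline uint32_t Sigma0(uint32_t a)
    {
        return rotr32(a ^ rotr32(a, 11) ^ rotr32(a, 20), 2);
    }

    /* Ch mirrors the and/andc/or triple; Maj mirrors the two-and/two-xor form. */
    static inline uint32_t Ch(uint32_t e, uint32_t f, uint32_t g)
    {
        return (f & e) | (g & ~e);
    }

    static inline uint32_t Maj(uint32_t a, uint32_t b, uint32_t c)
    {
        return (a & b) ^ ((a ^ b) & c);
    }

    /* One round: s[0..7] holds a..h, w is the schedule word, k the round constant. */
    static void sha256_round(uint32_t s[8], uint32_t w, uint32_t k)
    {
        uint32_t t1 = s[7] + Sigma1(s[4]) + Ch(s[4], s[5], s[6]) + w + k;
        uint32_t t2 = Sigma0(s[0]) + Maj(s[0], s[1], s[2]);
        s[7] = s[6]; s[6] = s[5]; s[5] = s[4];
        s[4] = s[3] + t1;
        s[3] = s[2]; s[2] = s[1]; s[1] = s[0];
        s[0] = t1 + t2;
    }

In the assembly, w comes from the stack frame (the lwz loads offset from r1) and k from the constant table addressed by r16, which is why every round opens with that pair of loads.
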
lwz r12, 8*4+(16+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (16+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(16+2)*4(r1) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (16+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(16+3)*4(r1) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (16+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 8*4+(20+0)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (20+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 8*4+(20+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (20+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(20+2)*4(r1) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (20+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(20+3)*4(r1) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (20+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 
+ xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 8*4+(24+0)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (24+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 8*4+(24+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (24+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(24+2)*4(r1) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (24+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(24+3)*4(r1) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (24+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 8*4+(28+0)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (28+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 8*4+(28+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (28+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(28+2)*4(r1) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (28+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor 
r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(28+3)*4(r1) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (28+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 8*4+(32+0)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (32+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 8*4+(32+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (32+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(32+2)*4(r1) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (32+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(32+3)*4(r1) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (32+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 8*4+(36+0)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (36+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 8*4+(36+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (36+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, 
r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(36+2)*4(r1) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (36+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(36+3)*4(r1) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (36+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 8*4+(40+0)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (40+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 8*4+(40+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (40+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(40+2)*4(r1) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (40+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(40+3)*4(r1) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (40+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 8*4+(44+0)*4(r1) + and r13, r9, r8 
+ andc r14, r10, r8 + lwz r15, (44+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 8*4+(44+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (44+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(44+2)*4(r1) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (44+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(44+3)*4(r1) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (44+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 8*4+(48+0)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (48+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 8*4+(48+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (48+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(48+2)*4(r1) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (48+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, 
r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(48+3)*4(r1) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (48+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 8*4+(52+0)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (52+0)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + lwz r12, 8*4+(52+1)*4(r1) + and r13, r8, r11 + andc r14, r9, r11 + lwz r15, (52+1)*4(r16) + or r14, r14, r13 + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + xor r15, r7, r4 + add r10, r10, r13 + + rotrwi r13, r7, 11 + and r15, r15, r5 + xor r12, r7, r13 + rotrwi r13, r7, 20 + and r14, r7, r4 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r10, r14 + add r10, r10, r6 + add r6, r15, r13 + lwz r12, 8*4+(52+2)*4(r1) + and r13, r11, r10 + andc r14, r8, r10 + lwz r15, (52+2)*4(r16) + or r14, r14, r13 + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + xor r15, r6, r7 + add r9, r9, r13 + + rotrwi r13, r6, 11 + and r15, r15, r4 + xor r12, r6, r13 + rotrwi r13, r6, 20 + and r14, r6, r7 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r9, r14 + add r9, r9, r5 + add r5, r15, r13 + lwz r12, 8*4+(52+3)*4(r1) + and r13, r10, r9 + andc r14, r11, r9 + lwz r15, (52+3)*4(r16) + or r14, r14, r13 + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + xor r15, r5, r6 + add r8, r8, r13 + + rotrwi r13, r5, 11 + and r15, r15, r7 + xor r12, r5, r13 + rotrwi r13, r5, 20 + and r14, r5, r6 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r8, r14 + add r8, r8, r4 + add r4, r15, r13 + lwz r12, 8*4+(56)*4(r1) + and r13, r9, r8 + andc r14, r10, r8 + lwz r15, (56)*4(r16) + or r14, r14, r13 + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + xor r15, r4, r5 + add r11, r11, r13 + + rotrwi r13, r4, 11 + and r15, r15, r6 + xor r12, r4, r13 + rotrwi r13, r4, 20 + and r14, r4, r5 + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, r11, r14 + add r11, r11, r7 + add r7, r15, r13 + + + lwz r12, 8*4+(57)*4(r1) + and r15, r8, r11 + andc r14, r9, r11 + add r10, r10, r6 + or r14, r14, r15 + lwz r15, (57)*4(r16) + rotrwi r13, r11, 5 + add r10, r10, r14 + xor r14, r11, r13 + rotrwi r13, r11, 19 + add r10, r10, r12 + xor r14, r14, r13 + add r10, r10, r15 + rotrwi r13, r14, 6 + add r10, r10, r13 + lwz r12, 
8*4+(58)*4(r1) + and r15, r11, r10 + andc r14, r8, r10 + add r9, r9, r5 + or r14, r14, r15 + lwz r15, (58)*4(r16) + rotrwi r13, r10, 5 + add r9, r9, r14 + xor r14, r10, r13 + rotrwi r13, r10, 19 + add r9, r9, r12 + xor r14, r14, r13 + add r9, r9, r15 + rotrwi r13, r14, 6 + add r9, r9, r13 + lwz r12, 8*4+(59)*4(r1) + and r15, r10, r9 + andc r14, r11, r9 + add r8, r8, r4 + or r14, r14, r15 + lwz r15, (59)*4(r16) + rotrwi r13, r9, 5 + add r8, r8, r14 + xor r14, r9, r13 + rotrwi r13, r9, 19 + add r8, r8, r12 + xor r14, r14, r13 + add r8, r8, r15 + rotrwi r13, r14, 6 + add r8, r8, r13 + lwz r5, 7*4(r18) + lwz r12, 8*4+(60)*4(r1) + and r15, r9, r8 + andc r14, r10, r8 + add r11, r11, r7 + or r14, r14, r15 + lwz r15, (60)*4(r16) + rotrwi r13, r8, 5 + add r11, r11, r14 + xor r14, r8, r13 + rotrwi r13, r8, 19 + add r11, r11, r12 + xor r14, r14, r13 + add r11, r11, r15 + rotrwi r13, r14, 6 + add r11, r11, r13 + + add r11, r11, r5 + stw r11, 7*4(r3) + + ld r13, 2*4(r1) + ld r14, 4*4(r1) + ld r15, 6*4(r1) + ld r16, 72*4(r1) + ld r17, 74*4(r1) + ld r18, 76*4(r1) + addi r1, r1, 80*4 + blr + + +#ifdef __ALTIVEC__ + +#ifdef __APPLE__ + .machine ppc7400 +#endif + +#ifdef _AIX + .csect .text[RO] +#else + .data +#endif + .align 4 +sha256_4h: + .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 + .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 + .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372 + .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a + .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f + .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c + .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab + .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 + + .align 4 +sha256_4k: + .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98 + .long 0x71374491, 0x71374491, 0x71374491, 0x71374491 + .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf + .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5 + .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b + .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1 + .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4 + .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5 + .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98 + .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01 + .long 0x243185be, 0x243185be, 0x243185be, 0x243185be + .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3 + .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74 + .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe + .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7 + .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174 + .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1 + .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786 + .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6 + .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc + .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f + .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa + .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc + .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da + .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152 + .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d + .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8 + .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7 + .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3 + .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147 + .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351 + .long 0x14292967, 0x14292967, 0x14292967, 0x14292967 + .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85 + .long 0x2e1b2138, 0x2e1b2138, 
0x2e1b2138, 0x2e1b2138 + .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc + .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13 + .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354 + .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb + .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e + .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85 + .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1 + .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b + .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70 + .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3 + .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819 + .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624 + .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585 + .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070 + .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116 + .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08 + .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c + .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5 + .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3 + .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a + .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f + .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3 + .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee + .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f + .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814 + .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208 + .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa + .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb + .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7 + .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2 + + .align 4 +sha256d_4preext2: + .long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000 + .long 0x11002000, 0x11002000, 0x11002000, 0x11002000 + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 + .long 0x00400022, 0x00400022, 0x00400022, 0x00400022 + + .align 4 +br_perm: + .long 0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c + +#ifdef _AIX + .toc +T.sha256_4h: + .tc sha256_4h[TC], sha256_4h +T.sha256_4k: + .tc sha256_4k[TC], sha256_4k +T.sha256d_4preext2: + .tc sha256d_4preext2[TC], sha256d_4preext2 +T.br_perm: + .tc br_perm[TC], br_perm +#endif + + + + + + + + + +#ifdef _AIX + .csect .text[PR] +#else + .text +#endif + .align 2 + .globl sha256_init_4way + .globl _sha256_init_4way + .globl .sha256_init_4way +#ifdef __ELF__ + .type sha256_init_4way, %function +#endif +sha256_init_4way: +_sha256_init_4way: +.sha256_init_4way: + mfspr r0, 256 + oris r12, r0, 0xff00 + mtspr 256, r12 + +#ifdef _AIX + ld r4, T.sha256_4h(r2) +#else + lis r4, HI(sha256_4h) addi r4, r4, LO(sha256_4h) #endif - li r5, 1*16 - li r6, 2*16 - li r7, 3*16 - li r8, 4*16 - li r9, 5*16 - li r10, 6*16 - li r11, 7*16 - lvx v0, 0, r4 - lvx v1, r4, r5 - lvx v2, r4, r6 - lvx v3, r4, r7 - lvx v4, r4, r8 - lvx v5, r4, r9 + li r5, 1*16 + li r6, 2*16 + li r7, 3*16 + li r8, 4*16 + li r9, 5*16 + li r10, 6*16 + li r11, 7*16 + lvx v0, 0, r4 + lvx v1, r4, r5 + lvx v2, r4, r6 + lvx v3, r4, r7 + lvx v4, r4, r8 + lvx v5, r4, r9 + lvx v6, r4, r10 + lvx v7, r4, r11 + stvx v0, 0, r3 + stvx v1, r3, r5 + stvx v2, r3, r6 + stvx v3, r3, r7 + stvx v4, r3, r8 + stvx v5, r3, r9 + stvx v6, r3, r10 + stvx v7, r3, r11 + + mtspr 256, r0 + blr + + + .align 2 + .globl sha256_transform_4way + .globl _sha256_transform_4way + .globl .sha256_transform_4way +#ifdef __ELF__ + .type sha256_transform_4way, %function +#endif +sha256_transform_4way: +_sha256_transform_4way: +.sha256_transform_4way: + mfspr r0, 256 + oris 
r12, r0, 0xffff + ori r12, r12, 0xf000 + mtspr 256, r12 + + andi. r6, r1, 15 + cmpwi 0, r5, 0 + li r7, -(4*4+64*16) + subf r6, r6, r7 + stdux r1, r1, r6 + + li r7, 1*16 + li r8, 2*16 + li r9, 3*16 + li r10, 4*16 + li r11, 5*16 + li r12, 6*16 + li r6, 7*16 + + bne 0, sha256_transform_4way_swap + + lvx v11, 0, r4 + lvx v1, r4, r7 + lvx v2, r4, r8 + lvx v3, r4, r9 + lvx v4, r4, r10 + lvx v5, r4, r11 + lvx v6, r4, r12 + lvx v7, r4, r6 + addi r5, r1, 4*4 + stvx v11, 0, r5 + stvx v1, r5, r7 + stvx v2, r5, r8 + stvx v3, r5, r9 + stvx v4, r5, r10 + stvx v5, r5, r11 + stvx v6, r5, r12 + stvx v7, r5, r6 + addi r4, r4, 8*16 + lvx v0, 0, r4 + lvx v4, r4, r7 + lvx v5, r4, r8 + lvx v6, r4, r9 + lvx v7, r4, r10 + lvx v8, r4, r11 + lvx v9, r4, r12 + lvx v10, r4, r6 + addi r4, r1, 4*4+8*16 + stvx v0, 0, r4 + stvx v4, r4, r7 + stvx v5, r4, r8 + stvx v6, r4, r9 + stvx v7, r4, r10 + stvx v8, r4, r11 + stvx v9, r4, r12 + stvx v10, r4, r6 + b sha256_transform_4way_extend + +sha256_transform_4way_swap: +#ifdef _AIX + ld r5, T.br_perm(r2) +#else + lis r5, HI(br_perm) + addi r5, r5, LO(br_perm) +#endif + lvx v19, 0, r5 + + lvx v11, 0, r4 + lvx v1, r4, r7 + lvx v2, r4, r8 + lvx v3, r4, r9 + lvx v4, r4, r10 + lvx v5, r4, r11 + lvx v6, r4, r12 + lvx v7, r4, r6 + vperm v11, v11, v11, v19 + vperm v1, v1, v1, v19 + vperm v2, v2, v2, v19 + vperm v3, v3, v3, v19 + vperm v4, v4, v4, v19 + vperm v5, v5, v5, v19 + vperm v6, v6, v6, v19 + vperm v7, v7, v7, v19 + addi r5, r1, 4*4 + stvx v11, 0, r5 + stvx v1, r5, r7 + stvx v2, r5, r8 + stvx v3, r5, r9 + stvx v4, r5, r10 + stvx v5, r5, r11 + stvx v6, r5, r12 + stvx v7, r5, r6 + addi r4, r4, 8*16 + lvx v0, 0, r4 + lvx v4, r4, r7 + lvx v5, r4, r8 + lvx v6, r4, r9 + lvx v7, r4, r10 + lvx v8, r4, r11 + lvx v9, r4, r12 + lvx v10, r4, r6 + vperm v0, v0, v0, v19 + vperm v4, v4, v4, v19 + vperm v5, v5, v5, v19 + vperm v6, v6, v6, v19 + vperm v7, v7, v7, v19 + vperm v8, v8, v8, v19 + vperm v9, v9, v9, v19 + vperm v10, v10, v10, v19 + addi r4, r1, 4*4+8*16 + stvx v0, 0, r4 + stvx v4, r4, r7 + stvx v5, r4, r8 + stvx v6, r4, r9 + stvx v7, r4, r10 + stvx v8, r4, r11 + stvx v9, r4, r12 + stvx v10, r4, r6 + +sha256_transform_4way_extend: + li r10, 16*16 + li r11, 17*16 + vspltisw v0, 10 + vspltisw v1, -7 + vspltisw v16, 3 + vspltisw v17, 15 + vspltisw v18, 14 + vspltisw v19, 13 + lvx v14, r5, r7 + vrlw v12, v9, v17 + vrlw v13, v9, v19 + vadduwm v11, v11, v4 + vxor v12, v12, v13 + vsrw v13, v9, v0 + vrlw v4, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v4, v4, v13 + vsrw v13, v14, v16 + lvx v11, r5, r8 + vxor v4, v4, v13 + vrlw v13, v10, v19 + vadduwm v4, v4, v12 + + vrlw v12, v10, v17 + vadduwm v14, v14, v5 + vxor v12, v12, v13 + vsrw v13, v10, v0 + vrlw v5, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v4, r5, r10 + vxor v5, v5, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v5, v5, v13 + vadduwm v5, v5, v14 + stvx v5, r5, r11 + addi r5, r5, 2*16 + lvx v14, r5, r7 + vrlw v12, v4, v17 + vrlw v13, v4, v19 + vadduwm v11, v11, v6 + vxor v12, v12, v13 + vsrw v13, v4, v0 + vrlw v6, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v6, v6, v13 + vsrw v13, v14, v16 + lvx v11, r5, r8 + vxor v6, v6, v13 + vrlw v13, v5, v19 + vadduwm v6, v6, v12 + + vrlw v12, v5, v17 + vadduwm v14, v14, v7 + vxor v12, v12, v13 + vsrw v13, v5, v0 + vrlw v7, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v6, r5, r10 + vxor v7, v7, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v7, v7, v13 + vadduwm v7, v7, v14 + stvx 
v7, r5, r11 + addi r5, r5, 2*16 + lvx v14, r5, r7 + vrlw v12, v6, v17 + vrlw v13, v6, v19 + vadduwm v11, v11, v8 + vxor v12, v12, v13 + vsrw v13, v6, v0 + vrlw v8, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v8, v8, v13 + vsrw v13, v14, v16 + lvx v11, r5, r8 + vxor v8, v8, v13 + vrlw v13, v7, v19 + vadduwm v8, v8, v12 + + vrlw v12, v7, v17 + vadduwm v14, v14, v9 + vxor v12, v12, v13 + vsrw v13, v7, v0 + vrlw v9, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v8, r5, r10 + vxor v9, v9, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v9, v9, v13 + vadduwm v9, v9, v14 + stvx v9, r5, r11 + addi r5, r5, 2*16 + lvx v14, r5, r7 + vrlw v12, v8, v17 + vrlw v13, v8, v19 + vadduwm v11, v11, v10 + vxor v12, v12, v13 + vsrw v13, v8, v0 + vrlw v10, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v10, v10, v13 + vsrw v13, v14, v16 + lvx v11, r5, r8 + vxor v10, v10, v13 + vrlw v13, v9, v19 + vadduwm v10, v10, v12 + + vrlw v12, v9, v17 + vadduwm v14, v14, v4 + vxor v12, v12, v13 + vsrw v13, v9, v0 + vrlw v4, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v10, r5, r10 + vxor v4, v4, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v4, v4, v13 + vadduwm v4, v4, v14 + stvx v4, r5, r11 + addi r5, r5, 2*16 + lvx v14, r5, r7 + vrlw v12, v10, v17 + vrlw v13, v10, v19 + vadduwm v11, v11, v5 + vxor v12, v12, v13 + vsrw v13, v10, v0 + vrlw v5, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v5, v5, v13 + vsrw v13, v14, v16 + lvx v11, r5, r8 + vxor v5, v5, v13 + vrlw v13, v4, v19 + vadduwm v5, v5, v12 + + vrlw v12, v4, v17 + vadduwm v14, v14, v6 + vxor v12, v12, v13 + vsrw v13, v4, v0 + vrlw v6, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v5, r5, r10 + vxor v6, v6, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v6, v6, v13 + vadduwm v6, v6, v14 + stvx v6, r5, r11 + addi r5, r5, 2*16 + lvx v14, r5, r7 + vrlw v12, v5, v17 + vrlw v13, v5, v19 + vadduwm v11, v11, v7 + vxor v12, v12, v13 + vsrw v13, v5, v0 + vrlw v7, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v7, v7, v13 + vsrw v13, v14, v16 + lvx v11, r5, r8 + vxor v7, v7, v13 + vrlw v13, v6, v19 + vadduwm v7, v7, v12 + + vrlw v12, v6, v17 + vadduwm v14, v14, v8 + vxor v12, v12, v13 + vsrw v13, v6, v0 + vrlw v8, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v7, r5, r10 + vxor v8, v8, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v8, v8, v13 + vadduwm v8, v8, v14 + stvx v8, r5, r11 + addi r5, r5, 2*16 + lvx v14, r5, r7 + vrlw v12, v7, v17 + vrlw v13, v7, v19 + vadduwm v11, v11, v9 + vxor v12, v12, v13 + vsrw v13, v7, v0 + vrlw v9, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v9, v9, v13 + vsrw v13, v14, v16 + lvx v11, r5, r8 + vxor v9, v9, v13 + vrlw v13, v8, v19 + vadduwm v9, v9, v12 + + vrlw v12, v8, v17 + vadduwm v14, v14, v10 + vxor v12, v12, v13 + vsrw v13, v8, v0 + vrlw v10, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v9, r5, r10 + vxor v10, v10, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v10, v10, v13 + vadduwm v10, v10, v14 + stvx v10, r5, r11 + addi r5, r5, 2*16 + lvx v14, r5, r7 + vrlw v12, v9, v17 + vrlw v13, v9, v19 + vadduwm v11, v11, v4 + vxor v12, v12, v13 + vsrw v13, v9, v0 + vrlw v4, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v4, v4, v13 + vsrw v13, v14, v16 + lvx v11, r5, r8 + vxor v4, v4, v13 + vrlw v13, v10, v19 + vadduwm v4, v4, v12 + + 
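
Editor's note on this stretch: the vrlw/vsrw/vxor/vadduwm groups are the SHA-256 message-schedule extension performed four ways at once. Each 16-byte vector carries the same word position of four independent message blocks, which is also why the sha256_4h and sha256_4k tables earlier in this hunk repeat every constant four times. A minimal scalar sketch of the recurrence being evaluated follows; the names rotr32, sigma0, sigma1 and sha256_extend are ours, not identifiers from the patch.

    #include <stdint.h>

    static inline uint32_t rotr32(uint32_t x, unsigned n)
    {
        return (x >> n) | (x << (32 - n));
    }

    /* Lower-case sigma0: ROTR7 ^ ROTR18 ^ SHR3. */
    static inline uint32_t sigma0(uint32_t x)
    {
        return rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3);
    }

    /* Lower-case sigma1: ROTR17 ^ ROTR19 ^ SHR10. */
    static inline uint32_t sigma1(uint32_t x)
    {
        return rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10);
    }

    /* Extend the 16 input words to the full 64-entry schedule. */
    static void sha256_extend(uint32_t w[64])
    {
        for (int i = 16; i < 64; i++)
            w[i] = w[i - 16] + sigma0(w[i - 15]) + w[i - 7] + sigma1(w[i - 2]);
    }

The vector code gets its rotate counts from the vspltisw immediates set up at sha256_transform_4way_extend: rotating left by 25 (the splat of -7), 14, 15 and 13 is the same as rotating right by 7, 18, 17 and 19, and vsrw by 3 and 10 supplies the plain shifts in sigma0 and sigma1.
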
vrlw v12, v10, v17 + vadduwm v14, v14, v5 + vxor v12, v12, v13 + vsrw v13, v10, v0 + vrlw v5, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v4, r5, r10 + vxor v5, v5, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v5, v5, v13 + vadduwm v5, v5, v14 + stvx v5, r5, r11 + addi r5, r5, 2*16 + lvx v14, r5, r7 + vrlw v12, v4, v17 + vrlw v13, v4, v19 + vadduwm v11, v11, v6 + vxor v12, v12, v13 + vsrw v13, v4, v0 + vrlw v6, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v6, v6, v13 + vsrw v13, v14, v16 + lvx v11, r5, r8 + vxor v6, v6, v13 + vrlw v13, v5, v19 + vadduwm v6, v6, v12 + + vrlw v12, v5, v17 + vadduwm v14, v14, v7 + vxor v12, v12, v13 + vsrw v13, v5, v0 + vrlw v7, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v6, r5, r10 + vxor v7, v7, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v7, v7, v13 + vadduwm v7, v7, v14 + stvx v7, r5, r11 + addi r5, r5, 2*16 + lvx v14, r5, r7 + vrlw v12, v6, v17 + vrlw v13, v6, v19 + vadduwm v11, v11, v8 + vxor v12, v12, v13 + vsrw v13, v6, v0 + vrlw v8, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v8, v8, v13 + vsrw v13, v14, v16 + lvx v11, r5, r8 + vxor v8, v8, v13 + vrlw v13, v7, v19 + vadduwm v8, v8, v12 + + vrlw v12, v7, v17 + vadduwm v14, v14, v9 + vxor v12, v12, v13 + vsrw v13, v7, v0 + vrlw v9, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v8, r5, r10 + vxor v9, v9, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v9, v9, v13 + vadduwm v9, v9, v14 + stvx v9, r5, r11 + addi r5, r5, 2*16 + lvx v14, r5, r7 + vrlw v12, v8, v17 + vrlw v13, v8, v19 + vadduwm v11, v11, v10 + vxor v12, v12, v13 + vsrw v13, v8, v0 + vrlw v10, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v10, v10, v13 + vsrw v13, v14, v16 + lvx v11, r5, r8 + vxor v10, v10, v13 + vrlw v13, v9, v19 + vadduwm v10, v10, v12 + + vrlw v12, v9, v17 + vadduwm v14, v14, v4 + vxor v12, v12, v13 + vsrw v13, v9, v0 + vrlw v4, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v10, r5, r10 + vxor v4, v4, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v4, v4, v13 + vadduwm v4, v4, v14 + stvx v4, r5, r11 + addi r5, r5, 2*16 + lvx v14, r5, r7 + vrlw v12, v10, v17 + vrlw v13, v10, v19 + vadduwm v11, v11, v5 + vxor v12, v12, v13 + vsrw v13, v10, v0 + vrlw v5, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v5, v5, v13 + vsrw v13, v14, v16 + lvx v11, r5, r8 + vxor v5, v5, v13 + vrlw v13, v4, v19 + vadduwm v5, v5, v12 + + vrlw v12, v4, v17 + vadduwm v14, v14, v6 + vxor v12, v12, v13 + vsrw v13, v4, v0 + vrlw v6, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v5, r5, r10 + vxor v6, v6, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v6, v6, v13 + vadduwm v6, v6, v14 + stvx v6, r5, r11 + addi r5, r5, 2*16 + lvx v14, r5, r7 + vrlw v12, v5, v17 + vrlw v13, v5, v19 + vadduwm v11, v11, v7 + vxor v12, v12, v13 + vsrw v13, v5, v0 + vrlw v7, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v7, v7, v13 + vsrw v13, v14, v16 + lvx v11, r5, r8 + vxor v7, v7, v13 + vrlw v13, v6, v19 + vadduwm v7, v7, v12 + + vrlw v12, v6, v17 + vadduwm v14, v14, v8 + vxor v12, v12, v13 + vsrw v13, v6, v0 + vrlw v8, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v7, r5, r10 + vxor v8, v8, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v8, v8, v13 + vadduwm v8, v8, v14 + stvx v8, r5, r11 + addi r5, r5, 2*16 + lvx v14, r5, r7 + vrlw v12, v7, v17 + vrlw v13, v7, v19 + 
vadduwm v11, v11, v9 + vxor v12, v12, v13 + vsrw v13, v7, v0 + vrlw v9, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v9, v9, v13 + vsrw v13, v14, v16 + lvx v11, r5, r8 + vxor v9, v9, v13 + vrlw v13, v8, v19 + vadduwm v9, v9, v12 + + vrlw v12, v8, v17 + vadduwm v14, v14, v10 + vxor v12, v12, v13 + vsrw v13, v8, v0 + vrlw v10, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v9, r5, r10 + vxor v10, v10, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v10, v10, v13 + vadduwm v10, v10, v14 + stvx v10, r5, r11 + addi r5, r5, 2*16 + lvx v14, r5, r7 + vrlw v12, v9, v17 + vrlw v13, v9, v19 + vadduwm v11, v11, v4 + vxor v12, v12, v13 + vsrw v13, v9, v0 + vrlw v4, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v4, v4, v13 + vsrw v13, v14, v16 + lvx v11, r5, r8 + vxor v4, v4, v13 + vrlw v13, v10, v19 + vadduwm v4, v4, v12 + + vrlw v12, v10, v17 + vadduwm v14, v14, v5 + vxor v12, v12, v13 + vsrw v13, v10, v0 + vrlw v5, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v4, r5, r10 + vxor v5, v5, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v5, v5, v13 + vadduwm v5, v5, v14 + stvx v5, r5, r11 + addi r5, r5, 2*16 + lvx v14, r5, r7 + vrlw v12, v4, v17 + vrlw v13, v4, v19 + vadduwm v11, v11, v6 + vxor v12, v12, v13 + vsrw v13, v4, v0 + vrlw v6, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v6, v6, v13 + vsrw v13, v14, v16 + lvx v11, r5, r8 + vxor v6, v6, v13 + vrlw v13, v5, v19 + vadduwm v6, v6, v12 + + vrlw v12, v5, v17 + vadduwm v14, v14, v7 + vxor v12, v12, v13 + vsrw v13, v5, v0 + vrlw v7, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v6, r5, r10 + vxor v7, v7, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v7, v7, v13 + vadduwm v7, v7, v14 + stvx v7, r5, r11 + addi r5, r5, 2*16 + lvx v14, r5, r7 + vrlw v12, v6, v17 + vrlw v13, v6, v19 + vadduwm v11, v11, v8 + vxor v12, v12, v13 + vsrw v13, v6, v0 + vrlw v8, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v8, v8, v13 + vsrw v13, v14, v16 + lvx v11, r5, r8 + vxor v8, v8, v13 + vrlw v13, v7, v19 + vadduwm v8, v8, v12 + + vrlw v12, v7, v17 + vadduwm v14, v14, v9 + vxor v12, v12, v13 + vsrw v13, v7, v0 + vrlw v9, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v8, r5, r10 + vxor v9, v9, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v9, v9, v13 + vadduwm v9, v9, v14 + stvx v9, r5, r11 + addi r5, r5, 2*16 + lvx v14, r5, r7 + vrlw v12, v8, v17 + vrlw v13, v8, v19 + vadduwm v11, v11, v10 + vxor v12, v12, v13 + vsrw v13, v8, v0 + vrlw v10, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v10, v10, v13 + vsrw v13, v14, v16 + lvx v11, r5, r8 + vxor v10, v10, v13 + vrlw v13, v9, v19 + vadduwm v10, v10, v12 + + vrlw v12, v9, v17 + vadduwm v14, v14, v4 + vxor v12, v12, v13 + vsrw v13, v9, v0 + vrlw v4, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v10, r5, r10 + vxor v4, v4, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v4, v4, v13 + vadduwm v4, v4, v14 + stvx v4, r5, r11 + addi r5, r5, 2*16 + lvx v14, r5, r7 + vrlw v12, v10, v17 + vrlw v13, v10, v19 + vadduwm v11, v11, v5 + vxor v12, v12, v13 + vsrw v13, v10, v0 + vrlw v5, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v5, v5, v13 + vsrw v13, v14, v16 + lvx v11, r5, r8 + vxor v5, v5, v13 + vrlw v13, v4, v19 + vadduwm v5, v5, v12 + + vrlw v12, v4, v17 + vadduwm v14, v14, v6 + vxor v12, v12, v13 + vsrw v13, v4, v0 + vrlw v6, 
v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v5, r5, r10 + vxor v6, v6, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v6, v6, v13 + vadduwm v6, v6, v14 + stvx v6, r5, r11 + addi r5, r5, 2*16 + lvx v14, r5, r7 + vrlw v12, v5, v17 + vrlw v13, v5, v19 + vadduwm v11, v11, v7 + vxor v12, v12, v13 + vsrw v13, v5, v0 + vrlw v7, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v7, v7, v13 + vsrw v13, v14, v16 + lvx v11, r5, r8 + vxor v7, v7, v13 + vrlw v13, v6, v19 + vadduwm v7, v7, v12 + + vrlw v12, v6, v17 + vadduwm v14, v14, v8 + vxor v12, v12, v13 + vsrw v13, v6, v0 + vrlw v8, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v7, r5, r10 + vxor v8, v8, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v8, v8, v13 + vadduwm v8, v8, v14 + stvx v8, r5, r11 + addi r5, r5, 2*16 + lvx v14, r5, r7 + vrlw v12, v7, v17 + vrlw v13, v7, v19 + vadduwm v11, v11, v9 + vxor v12, v12, v13 + vsrw v13, v7, v0 + vrlw v9, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v9, v9, v13 + vsrw v13, v14, v16 + lvx v11, r5, r8 + vxor v9, v9, v13 + vrlw v13, v8, v19 + vadduwm v9, v9, v12 + + vrlw v12, v8, v17 + vadduwm v14, v14, v10 + vxor v12, v12, v13 + vsrw v13, v8, v0 + vrlw v10, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v9, r5, r10 + vxor v10, v10, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v10, v10, v13 + vadduwm v10, v10, v14 + stvx v10, r5, r11 + addi r5, r5, 2*16 + lvx v14, r5, r7 + vrlw v12, v9, v17 + vrlw v13, v9, v19 + vadduwm v11, v11, v4 + vxor v12, v12, v13 + vsrw v13, v9, v0 + vrlw v4, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v4, v4, v13 + vsrw v13, v14, v16 + lvx v11, r5, r8 + vxor v4, v4, v13 + vrlw v13, v10, v19 + vadduwm v4, v4, v12 + + vrlw v12, v10, v17 + vadduwm v14, v14, v5 + vxor v12, v12, v13 + vsrw v13, v10, v0 + vrlw v5, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v4, r5, r10 + vxor v5, v5, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v5, v5, v13 + vadduwm v5, v5, v14 + stvx v5, r5, r11 + addi r5, r5, 2*16 + lvx v14, r5, r7 + vrlw v12, v4, v17 + vrlw v13, v4, v19 + vadduwm v11, v11, v6 + vxor v12, v12, v13 + vsrw v13, v4, v0 + vrlw v6, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v6, v6, v13 + vsrw v13, v14, v16 + lvx v11, r5, r8 + vxor v6, v6, v13 + vrlw v13, v5, v19 + vadduwm v6, v6, v12 + + vrlw v12, v5, v17 + vadduwm v14, v14, v7 + vxor v12, v12, v13 + vsrw v13, v5, v0 + vrlw v7, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v6, r5, r10 + vxor v7, v7, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v7, v7, v13 + vadduwm v7, v7, v14 + stvx v7, r5, r11 + addi r5, r5, 2*16 + lvx v14, r5, r7 + vrlw v12, v6, v17 + vrlw v13, v6, v19 + vadduwm v11, v11, v8 + vxor v12, v12, v13 + vsrw v13, v6, v0 + vrlw v8, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v8, v8, v13 + vsrw v13, v14, v16 + lvx v11, r5, r8 + vxor v8, v8, v13 + vrlw v13, v7, v19 + vadduwm v8, v8, v12 + + vrlw v12, v7, v17 + vadduwm v14, v14, v9 + vxor v12, v12, v13 + vsrw v13, v7, v0 + vrlw v9, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v8, r5, r10 + vxor v9, v9, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v9, v9, v13 + vadduwm v9, v9, v14 + stvx v9, r5, r11 + addi r5, r5, 2*16 + + addi r11, r3, 4*16 + lvx v4, 0, r3 + lvx v5, r3, r7 + lvx v6, r3, r8 + lvx v7, r3, r9 + lvx v8, 0, r11 + lvx v9, r11, r7 + lvx v10, r11, r8 + lvx v11, r11, 
r9 +#ifdef _AIX + ld r12, T.sha256_4k(r2) +#else + lis r12, HI(sha256_4k) + addi r12, r12, LO(sha256_4k) +#endif + addi r5, r1, 4*4 + vspltisw v2, 12 + vspltisw v3, -5 + vspltisw v16, -6 + vspltisw v17, -11 + vspltisw v18, -2 + li r6, (0+0)*16 + lvx v12, r5, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (0+1)*16 + lvx v12, r5, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (0+2)*16 + lvx v12, r5, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (0+3)*16 + lvx v12, r5, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (4+0)*16 + lvx v12, r5, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (4+1)*16 + lvx v12, r5, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, 
v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (4+2)*16 + lvx v12, r5, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (4+3)*16 + lvx v12, r5, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (8+0)*16 + lvx v12, r5, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (8+1)*16 + lvx v12, r5, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (8+2)*16 + lvx v12, r5, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (8+3)*16 + lvx v12, r5, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (12+0)*16 + lvx v12, r5, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm 
v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (12+1)*16 + lvx v12, r5, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (12+2)*16 + lvx v12, r5, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (12+3)*16 + lvx v12, r5, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (16+0)*16 + lvx v12, r5, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (16+1)*16 + lvx v12, r5, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (16+2)*16 + lvx v12, r5, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm 
v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (16+3)*16 + lvx v12, r5, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (20+0)*16 + lvx v12, r5, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (20+1)*16 + lvx v12, r5, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (20+2)*16 + lvx v12, r5, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (20+3)*16 + lvx v12, r5, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (24+0)*16 + lvx v12, r5, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, 
v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (24+1)*16 + lvx v12, r5, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (24+2)*16 + lvx v12, r5, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (24+3)*16 + lvx v12, r5, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (28+0)*16 + lvx v12, r5, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (28+1)*16 + lvx v12, r5, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (28+2)*16 + lvx v12, r5, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (28+3)*16 + lvx v12, r5, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw 
v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (32+0)*16 + lvx v12, r5, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (32+1)*16 + lvx v12, r5, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (32+2)*16 + lvx v12, r5, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (32+3)*16 + lvx v12, r5, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (36+0)*16 + lvx v12, r5, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (36+1)*16 + lvx v12, r5, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, 
v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (36+2)*16 + lvx v12, r5, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (36+3)*16 + lvx v12, r5, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (40+0)*16 + lvx v12, r5, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (40+1)*16 + lvx v12, r5, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (40+2)*16 + lvx v12, r5, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (40+3)*16 + lvx v12, r5, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 
+ vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (44+0)*16 + lvx v12, r5, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (44+1)*16 + lvx v12, r5, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (44+2)*16 + lvx v12, r5, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (44+3)*16 + lvx v12, r5, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (48+0)*16 + lvx v12, r5, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (48+1)*16 + lvx v12, r5, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (48+2)*16 + lvx v12, r5, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor 
v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (48+3)*16 + lvx v12, r5, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (52+0)*16 + lvx v12, r5, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (52+1)*16 + lvx v12, r5, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (52+2)*16 + lvx v12, r5, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (52+3)*16 + lvx v12, r5, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (56+0)*16 + lvx v12, r5, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor 
v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (56+1)*16 + lvx v12, r5, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (56+2)*16 + lvx v12, r5, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (56+3)*16 + lvx v12, r5, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (60+0)*16 + lvx v12, r5, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (60+1)*16 + lvx v12, r5, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (60+2)*16 + lvx v12, r5, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, 
v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (60+3)*16 + lvx v12, r5, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + + lvx v12, 0, r3 + lvx v13, r3, r7 + lvx v14, r3, r8 + lvx v15, r3, r9 + lvx v16, 0, r11 + lvx v17, r11, r7 + lvx v18, r11, r8 + lvx v19, r11, r9 + vadduwm v4, v4, v12 + vadduwm v5, v5, v13 + vadduwm v6, v6, v14 + vadduwm v7, v7, v15 + vadduwm v8, v8, v16 + vadduwm v9, v9, v17 + vadduwm v10, v10, v18 + vadduwm v11, v11, v19 + stvx v4, 0, r3 + stvx v5, r3, r7 + stvx v6, r3, r8 + stvx v7, r3, r9 + stvx v8, 0, r11 + stvx v9, r11, r7 + stvx v10, r11, r8 + stvx v11, r11, r9 + + ld r1, 0(r1) + mtspr 256, r0 + blr + + + .align 2 + .globl sha256d_ms_4way + .globl _sha256d_ms_4way + .globl .sha256d_ms_4way +#ifdef __ELF__ + .type sha256d_ms_4way, %function +#endif +sha256d_ms_4way: +_sha256d_ms_4way: +.sha256d_ms_4way: + mfspr r0, 256 + oris r12, r0, 0xffff + ori r12, r12, 0xf000 + mtspr 256, r12 + + andi. r12, r1, 15 + li r11, -(4*4+64*16) + subf r12, r12, r11 + stdux r1, r1, r12 + + li r7, 1*16 + li r8, 2*16 + li r9, 3*16 + li r10, 16*16 + li r11, 17*16 + + vspltisw v0, 10 + vspltisw v1, -7 + vspltisw v16, 3 + vspltisw v17, 15 + vspltisw v18, 14 + vspltisw v19, 13 + + addi r4, r4, 2*16 + addi r12, r1, 4*4+18*16 + lvx v14, r4, r7 lvx v6, r4, r10 lvx v7, r4, r11 - stvx v0, 0, r3 - stvx v1, r3, r5 - stvx v2, r3, r6 - stvx v3, r3, r7 - stvx v4, r3, r8 - stvx v5, r3, r9 - stvx v6, r3, r10 - stvx v7, r3, r11 - mtspr 256, r0 - blr + vrlw v12, v14, v1 + vrlw v13, v14, v18 + stvx v6, 0, r12 + vxor v12, v12, v13 + vsrw v13, v14, v16 + stvx v7, r12, r7 + vxor v12, v12, v13 + vadduwm v6, v6, v12 + vadduwm v7, v7, v14 + stvx v6, r4, r10 + + vrlw v12, v6, v17 + vrlw v13, v6, v19 + stvx v7, r4, r11 + addi r4, r4, 18*16 + lvx v8, 0, r4 + vxor v12, v12, v13 + vsrw v13, v6, v0 + stvx v8, r12, r8 + vxor v12, v12, v13 + vadduwm v8, v8, v12 + + vrlw v9, v7, v17 + vrlw v13, v7, v19 + stvx v8, 0, r4 + vxor v9, v9, v13 + vsrw v13, v7, v0 + vxor v9, v9, v13 + + vrlw v12, v8, v17 + vrlw v13, v8, v19 + stvx v9, r4, r7 + vxor v12, v12, v13 + vsrw v13, v8, v0 + lvx v10, r4, r8 + lvx v4, r4, r9 + vxor v12, v12, v13 + stvx v10, r12, r9 + addi r12, r12, 4*16 + stvx v4, 0, r12 + vrlw v14, v9, v17 + vrlw v13, v9, v19 + vadduwm v10, v10, v12 + + vxor v14, v14, v13 + vsrw v13, v9, v0 + stvx v10, r4, r8 + vxor v14, v14, v13 + vadduwm v4, v4, v14 + + vrlw v12, v10, v17 + vrlw v13, v10, v19 + stvx v4, r4, r9 + vxor v12, v12, v13 + vsrw v13, v10, v0 + vrlw v14, v4, v17 + vxor v12, v12, v13 + vrlw v13, v4, v19 + addi r4, r4, 4*16 + lvx v5, 0, r4 + vxor v14, v14, v13 + stvx v5, r12, r7 + vsrw v13, v4, v0 + vadduwm v5, v5, v12 + vxor v14, v14, v13 + stvx v5, 0, r4 + vadduwm v6, v6, v14 + + vrlw v12, v5, v17 + vrlw v13, v5, v19 + stvx v6, r4, r7 + vxor v12, v12, v13 + vsrw v13, v5, v0 + vrlw v14, v6, v17 + vxor v12, v12, v13 + vrlw v13, v6, v19 + vxor v14, v14, v13 + vsrw v13, v6, v0 + vadduwm v7, v7, v12 + vxor v14, v14, v13 + stvx v7, r4, r8 + vadduwm v8, v8, v14 + + vrlw v12, v7, v17 + vrlw v13, v7, 
v19 + stvx v8, r4, r9 + vxor v12, v12, v13 + vsrw v13, v7, v0 + vrlw v14, v8, v17 + vxor v12, v12, v13 + vrlw v13, v8, v19 + vxor v14, v14, v13 + vsrw v13, v8, v0 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + addi r4, r4, 4*16 + stvx v9, 0, r4 + vadduwm v10, v10, v14 + + vrlw v12, v9, v17 + vrlw v13, v9, v19 + stvx v10, r4, r7 + vxor v12, v12, v13 + vsrw v13, v9, v0 + lvx v11, r4, r8 + lvx v14, r4, r9 + stvx v11, r12, r8 + stvx v14, r12, r9 + vxor v12, v12, v13 + vadduwm v11, v11, v12 + vadduwm v5, v5, v14 + vrlw v12, v10, v17 + vrlw v13, v10, v19 + vadduwm v4, v4, v11 + + vxor v12, v12, v13 + vsrw v13, v10, v0 + stvx v4, r4, r8 + vxor v12, v12, v13 + vadduwm v5, v5, v12 + stvx v5, r4, r9 + addi r4, r4, -12*16 + lvx v11, 0, r4 + + lvx v14, r4, r7 + vrlw v12, v4, v17 + vrlw v13, v4, v19 + vadduwm v11, v11, v6 + vxor v12, v12, v13 + vsrw v13, v4, v0 + vrlw v6, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v6, v6, v13 + vsrw v13, v14, v16 + lvx v11, r4, r8 + vxor v6, v6, v13 + vrlw v13, v5, v19 + vadduwm v6, v6, v12 + + vrlw v12, v5, v17 + vadduwm v14, v14, v7 + vxor v12, v12, v13 + vsrw v13, v5, v0 + vrlw v7, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v6, r4, r10 + vxor v7, v7, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v7, v7, v13 + vadduwm v7, v7, v14 + stvx v7, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vrlw v12, v6, v17 + vrlw v13, v6, v19 + vadduwm v11, v11, v8 + vxor v12, v12, v13 + vsrw v13, v6, v0 + vrlw v8, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v8, v8, v13 + vsrw v13, v14, v16 + lvx v11, r4, r8 + vxor v8, v8, v13 + vrlw v13, v7, v19 + vadduwm v8, v8, v12 + + vrlw v12, v7, v17 + vadduwm v14, v14, v9 + vxor v12, v12, v13 + vsrw v13, v7, v0 + vrlw v9, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v8, r4, r10 + vxor v9, v9, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v9, v9, v13 + vadduwm v9, v9, v14 + stvx v9, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vrlw v12, v8, v17 + vrlw v13, v8, v19 + vadduwm v11, v11, v10 + vxor v12, v12, v13 + vsrw v13, v8, v0 + vrlw v10, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v10, v10, v13 + vsrw v13, v14, v16 + lvx v11, r4, r8 + vxor v10, v10, v13 + vrlw v13, v9, v19 + vadduwm v10, v10, v12 + + vrlw v12, v9, v17 + vadduwm v14, v14, v4 + vxor v12, v12, v13 + vsrw v13, v9, v0 + vrlw v4, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v10, r4, r10 + vxor v4, v4, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v4, v4, v13 + vadduwm v4, v4, v14 + stvx v4, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vrlw v12, v10, v17 + vrlw v13, v10, v19 + vadduwm v11, v11, v5 + vxor v12, v12, v13 + vsrw v13, v10, v0 + vrlw v5, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v5, v5, v13 + vsrw v13, v14, v16 + lvx v11, r4, r8 + vxor v5, v5, v13 + vrlw v13, v4, v19 + vadduwm v5, v5, v12 + + vrlw v12, v4, v17 + vadduwm v14, v14, v6 + vxor v12, v12, v13 + vsrw v13, v4, v0 + vrlw v6, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v5, r4, r10 + vxor v6, v6, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v6, v6, v13 + vadduwm v6, v6, v14 + stvx v6, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vrlw v12, v5, v17 + vrlw v13, v5, v19 + vadduwm v11, v11, v7 + vxor v12, v12, v13 + vsrw v13, v5, v0 + vrlw v7, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v7, v7, v13 + vsrw v13, v14, v16 + lvx v11, r4, r8 + 
vxor v7, v7, v13 + vrlw v13, v6, v19 + vadduwm v7, v7, v12 + + vrlw v12, v6, v17 + vadduwm v14, v14, v8 + vxor v12, v12, v13 + vsrw v13, v6, v0 + vrlw v8, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v7, r4, r10 + vxor v8, v8, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v8, v8, v13 + vadduwm v8, v8, v14 + stvx v8, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vrlw v12, v7, v17 + vrlw v13, v7, v19 + vadduwm v11, v11, v9 + vxor v12, v12, v13 + vsrw v13, v7, v0 + vrlw v9, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v9, v9, v13 + vsrw v13, v14, v16 + lvx v11, r4, r8 + vxor v9, v9, v13 + vrlw v13, v8, v19 + vadduwm v9, v9, v12 + + vrlw v12, v8, v17 + vadduwm v14, v14, v10 + vxor v12, v12, v13 + vsrw v13, v8, v0 + vrlw v10, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v9, r4, r10 + vxor v10, v10, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v10, v10, v13 + vadduwm v10, v10, v14 + stvx v10, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vrlw v12, v9, v17 + vrlw v13, v9, v19 + vadduwm v11, v11, v4 + vxor v12, v12, v13 + vsrw v13, v9, v0 + vrlw v4, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v4, v4, v13 + vsrw v13, v14, v16 + lvx v11, r4, r8 + vxor v4, v4, v13 + vrlw v13, v10, v19 + vadduwm v4, v4, v12 + + vrlw v12, v10, v17 + vadduwm v14, v14, v5 + vxor v12, v12, v13 + vsrw v13, v10, v0 + vrlw v5, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v4, r4, r10 + vxor v5, v5, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v5, v5, v13 + vadduwm v5, v5, v14 + stvx v5, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vrlw v12, v4, v17 + vrlw v13, v4, v19 + vadduwm v11, v11, v6 + vxor v12, v12, v13 + vsrw v13, v4, v0 + vrlw v6, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v6, v6, v13 + vsrw v13, v14, v16 + lvx v11, r4, r8 + vxor v6, v6, v13 + vrlw v13, v5, v19 + vadduwm v6, v6, v12 + + vrlw v12, v5, v17 + vadduwm v14, v14, v7 + vxor v12, v12, v13 + vsrw v13, v5, v0 + vrlw v7, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v6, r4, r10 + vxor v7, v7, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v7, v7, v13 + vadduwm v7, v7, v14 + stvx v7, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vrlw v12, v6, v17 + vrlw v13, v6, v19 + vadduwm v11, v11, v8 + vxor v12, v12, v13 + vsrw v13, v6, v0 + vrlw v8, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v8, v8, v13 + vsrw v13, v14, v16 + lvx v11, r4, r8 + vxor v8, v8, v13 + vrlw v13, v7, v19 + vadduwm v8, v8, v12 + + vrlw v12, v7, v17 + vadduwm v14, v14, v9 + vxor v12, v12, v13 + vsrw v13, v7, v0 + vrlw v9, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v8, r4, r10 + vxor v9, v9, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v9, v9, v13 + vadduwm v9, v9, v14 + stvx v9, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vrlw v12, v8, v17 + vrlw v13, v8, v19 + vadduwm v11, v11, v10 + vxor v12, v12, v13 + vsrw v13, v8, v0 + vrlw v10, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v10, v10, v13 + vsrw v13, v14, v16 + lvx v11, r4, r8 + vxor v10, v10, v13 + vrlw v13, v9, v19 + vadduwm v10, v10, v12 + + vrlw v12, v9, v17 + vadduwm v14, v14, v4 + vxor v12, v12, v13 + vsrw v13, v9, v0 + vrlw v4, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v10, r4, r10 + vxor v4, v4, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v4, v4, v13 + vadduwm v4, v4, v14 + stvx v4, r4, r11 + addi r4, r4, 
2*16 + lvx v14, r4, r7 + vrlw v12, v10, v17 + vrlw v13, v10, v19 + vadduwm v11, v11, v5 + vxor v12, v12, v13 + vsrw v13, v10, v0 + vrlw v5, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v5, v5, v13 + vsrw v13, v14, v16 + lvx v11, r4, r8 + vxor v5, v5, v13 + vrlw v13, v4, v19 + vadduwm v5, v5, v12 + + vrlw v12, v4, v17 + vadduwm v14, v14, v6 + vxor v12, v12, v13 + vsrw v13, v4, v0 + vrlw v6, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v5, r4, r10 + vxor v6, v6, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v6, v6, v13 + vadduwm v6, v6, v14 + stvx v6, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vrlw v12, v5, v17 + vrlw v13, v5, v19 + vadduwm v11, v11, v7 + vxor v12, v12, v13 + vsrw v13, v5, v0 + vrlw v7, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v7, v7, v13 + vsrw v13, v14, v16 + lvx v11, r4, r8 + vxor v7, v7, v13 + vrlw v13, v6, v19 + vadduwm v7, v7, v12 + + vrlw v12, v6, v17 + vadduwm v14, v14, v8 + vxor v12, v12, v13 + vsrw v13, v6, v0 + vrlw v8, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v7, r4, r10 + vxor v8, v8, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v8, v8, v13 + vadduwm v8, v8, v14 + stvx v8, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vrlw v12, v7, v17 + vrlw v13, v7, v19 + vadduwm v11, v11, v9 + vxor v12, v12, v13 + vsrw v13, v7, v0 + vrlw v9, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v9, v9, v13 + vsrw v13, v14, v16 + lvx v11, r4, r8 + vxor v9, v9, v13 + vrlw v13, v8, v19 + vadduwm v9, v9, v12 + + vrlw v12, v8, v17 + vadduwm v14, v14, v10 + vxor v12, v12, v13 + vsrw v13, v8, v0 + vrlw v10, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v9, r4, r10 + vxor v10, v10, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v10, v10, v13 + vadduwm v10, v10, v14 + stvx v10, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vrlw v12, v9, v17 + vrlw v13, v9, v19 + vadduwm v11, v11, v4 + vxor v12, v12, v13 + vsrw v13, v9, v0 + vrlw v4, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v4, v4, v13 + vsrw v13, v14, v16 + lvx v11, r4, r8 + vxor v4, v4, v13 + vrlw v13, v10, v19 + vadduwm v4, v4, v12 + + vrlw v12, v10, v17 + vadduwm v14, v14, v5 + vxor v12, v12, v13 + vsrw v13, v10, v0 + vrlw v5, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v4, r4, r10 + vxor v5, v5, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v5, v5, v13 + vadduwm v5, v5, v14 + stvx v5, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vrlw v12, v4, v17 + vrlw v13, v4, v19 + vadduwm v11, v11, v6 + vxor v12, v12, v13 + vsrw v13, v4, v0 + vrlw v6, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v6, v6, v13 + vsrw v13, v14, v16 + lvx v11, r4, r8 + vxor v6, v6, v13 + vrlw v13, v5, v19 + vadduwm v6, v6, v12 + + vrlw v12, v5, v17 + vadduwm v14, v14, v7 + vxor v12, v12, v13 + vsrw v13, v5, v0 + vrlw v7, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v6, r4, r10 + vxor v7, v7, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v7, v7, v13 + vadduwm v7, v7, v14 + stvx v7, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vrlw v12, v6, v17 + vrlw v13, v6, v19 + vadduwm v11, v11, v8 + vxor v12, v12, v13 + vsrw v13, v6, v0 + vrlw v8, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v8, v8, v13 + vsrw v13, v14, v16 + lvx v11, r4, r8 + vxor v8, v8, v13 + vrlw v13, v7, v19 + vadduwm v8, v8, v12 + + vrlw v12, v7, v17 + vadduwm v14, 
v14, v9 + vxor v12, v12, v13 + vsrw v13, v7, v0 + vrlw v9, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v8, r4, r10 + vxor v9, v9, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v9, v9, v13 + vadduwm v9, v9, v14 + stvx v9, r4, r11 + addi r4, r4, 2*16 + addi r4, r4, -48*16 + + lvx v4, 0, r6 + lvx v9, r6, r7 + lvx v10, r6, r8 + lvx v11, r6, r9 + addi r12, r6, 4*16 + lvx v8, 0, r12 + lvx v5, r12, r7 + lvx v6, r12, r8 + lvx v7, r12, r9 +#ifdef _AIX + ld r12, T.sha256_4k(r2) +#else + lis r12, HI(sha256_4k) + addi r12, r12, LO(sha256_4k) +#endif + vspltisw v2, 12 + vspltisw v3, -5 + vspltisw v16, -6 + vspltisw v17, -11 + vspltisw v18, -2 + li r6, (3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (4+0)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (4+1)*16 + lvx v12, r4, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (4+2)*16 + lvx v12, r4, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (4+3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (8+0)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + 
vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (8+1)*16 + lvx v12, r4, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (8+2)*16 + lvx v12, r4, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (8+3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (12+0)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (12+1)*16 + lvx v12, r4, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (12+2)*16 + lvx v12, r4, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw 
v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (12+3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (16+0)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (16+1)*16 + lvx v12, r4, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (16+2)*16 + lvx v12, r4, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (16+3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (20+0)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, 
v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (20+1)*16 + lvx v12, r4, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (20+2)*16 + lvx v12, r4, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (20+3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (24+0)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (24+1)*16 + lvx v12, r4, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (24+2)*16 + lvx v12, r4, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (24+3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 
+ lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (28+0)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (28+1)*16 + lvx v12, r4, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (28+2)*16 + lvx v12, r4, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (28+3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (32+0)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (32+1)*16 + lvx v12, r4, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, 
v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (32+2)*16 + lvx v12, r4, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (32+3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (36+0)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (36+1)*16 + lvx v12, r4, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (36+2)*16 + lvx v12, r4, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (36+3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, 
v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (40+0)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (40+1)*16 + lvx v12, r4, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (40+2)*16 + lvx v12, r4, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (40+3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (44+0)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (44+1)*16 + lvx v12, r4, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (44+2)*16 + lvx v12, r4, r6 + vand v13, v11, v10 + 
vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (44+3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (48+0)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (48+1)*16 + lvx v12, r4, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (48+2)*16 + lvx v12, r4, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (48+3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (52+0)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + 
vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (52+1)*16 + lvx v12, r4, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (52+2)*16 + lvx v12, r4, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (52+3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (56+0)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (56+1)*16 + lvx v12, r4, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (56+2)*16 + lvx v12, r4, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand 
v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (56+3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (60+0)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 - .align 2 - .globl sha256_transform_4way - .globl _sha256_transform_4way - .globl .sha256_transform_4way -#ifdef __ELF__ - .type sha256_transform_4way, %function + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (60+1)*16 + lvx v12, r4, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (60+2)*16 + lvx v12, r4, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (60+3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + + lvx v12, 0, r5 + lvx v13, r5, r7 + lvx v14, r5, r8 + lvx v15, r5, r9 + addi r12, r5, 4*16 + lvx v16, 0, r12 + lvx v17, r12, r7 + lvx v18, r12, r8 + lvx v19, r12, r9 + vadduwm v4, v4, v12 + vadduwm v5, v5, v13 + vadduwm v6, v6, v14 + vadduwm v7, v7, v15 + vadduwm v8, v8, v16 + vadduwm v9, v9, v17 + vadduwm v10, v10, v18 + vadduwm v11, v11, v19 + addi r12, r1, 4*4 + stvx v4, 0, r12 + stvx v5, r12, r7 + stvx v6, r12, r8 + stvx v7, r12, r9 + addi 
r12, r12, 4*16 + stvx v8, 0, r12 + stvx v9, r12, r7 + stvx v10, r12, r8 + stvx v11, r12, r9 + + addi r12, r1, 4*4+18*16 + lvx v4, 0, r12 + lvx v5, r12, r7 + lvx v6, r12, r8 + lvx v7, r12, r9 + addi r12, r12, 4*16 + lvx v8, 0, r12 + lvx v9, r12, r7 + lvx v10, r12, r8 + lvx v11, r12, r9 + addi r12, r4, 18*16 + stvx v4, 0, r12 + stvx v5, r12, r7 + stvx v6, r12, r8 + addi r12, r4, 22*16 + stvx v7, 0, r12 + stvx v8, r12, r7 + stvx v9, r12, r8 + addi r12, r4, 30*16 + stvx v10, 0, r12 + stvx v11, r12, r7 + + addi r4, r1, 4*4 + + vspltisw v0, 10 + vspltisw v1, -7 + vspltisw v16, 3 + vspltisw v17, 15 + vspltisw v18, 14 + vspltisw v19, 13 + +#ifdef _AIX + ld r12, T.sha256d_4preext2(r2) +#else + lis r12, HI(sha256d_4preext2) + addi r12, r12, LO(sha256d_4preext2) #endif -sha256_transform_4way: -_sha256_transform_4way: -.sha256_transform_4way: - mfspr r0, 256 - oris r12, r0, 0xffff - ori r12, r12, 0xf000 - mtspr 256, r12 + lvx v2, 0, r12 - andi. r6, r1, 15 - cmpwi 0, r5, 0 - li r7, -(4*4+64*16) - subf r6, r6, r7 - stdux r1, r1, r6 + vxor v9, v9, v9 + vspltisw v3, 1 + lvx v4, r12, r8 + vsldoi v3, v3, v3, 1 + addi r5, r1, 4*4+8*16 + stvx v4, 0, r5 + stvx v9, r5, r7 + stvx v9, r5, r8 + stvx v9, r5, r9 + addi r5, r5, 4*16 + stvx v9, 0, r5 + stvx v9, r5, r7 + stvx v9, r5, r8 + stvx v3, r5, r9 + + lvx v4, 0, r4 + lvx v14, r4, r7 + + lvx v11, r4, r8 + vrlw v12, v14, v1 + vrlw v13, v14, v18 + + vxor v12, v12, v13 + vsrw v13, v14, v16 + vadduwm v5, v14, v2 + vxor v12, v12, v13 + vrlw v14, v11, v1 + vrlw v13, v11, v18 + vadduwm v4, v4, v12 + vxor v14, v14, v13 + vsrw v13, v11, v16 + stvx v4, r4, r10 + vxor v14, v14, v13 + vrlw v12, v4, v17 + vrlw v13, v4, v19 + vadduwm v5, v5, v14 + + stvx v5, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vxor v12, v12, v13 + vsrw v13, v4, v0 + vrlw v6, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vxor v6, v6, v13 + vsrw v13, v14, v16 + vadduwm v11, v11, v12 + vxor v6, v6, v13 + vrlw v12, v5, v17 + vrlw v13, v5, v19 + vadduwm v6, v6, v11 + lvx v11, r4, r8 + + stvx v6, r4, r10 + vxor v12, v12, v13 + vsrw v13, v5, v0 + vrlw v7, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + vxor v7, v7, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v7, v7, v13 + vrlw v12, v6, v17 + vrlw v13, v6, v19 + vadduwm v7, v7, v14 + + stvx v7, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vxor v12, v12, v13 + vsrw v13, v6, v0 + vrlw v8, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vxor v8, v8, v13 + vsrw v13, v14, v16 + vadduwm v11, v11, v12 + vxor v8, v8, v13 + vrlw v12, v7, v17 + vrlw v13, v7, v19 + vadduwm v8, v8, v11 + lvx v11, r4, r8 + + stvx v8, r4, r10 + vxor v12, v12, v13 + vsrw v13, v7, v0 + vrlw v9, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + vxor v9, v9, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v9, v9, v13 + vrlw v12, v8, v17 + vrlw v13, v8, v19 + vadduwm v9, v9, v14 + + stvx v9, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vxor v12, v12, v13 + vsrw v13, v8, v0 + vrlw v10, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vxor v10, v10, v13 + vsrw v13, v14, v16 + vadduwm v11, v11, v12 + vxor v10, v10, v13 + vrlw v12, v9, v17 + vrlw v13, v9, v19 + vadduwm v11, v11, v3 + vadduwm v14, v14, v4 + vadduwm v10, v10, v11 + + lvx v2, r12, r7 + vxor v12, v12, v13 + vsrw v13, v9, v0 + stvx v10, r4, r10 + vxor v12, v12, v13 + vadduwm v14, v14, v12 + vrlw v12, v10, v17 + vrlw v13, v10, v19 + vadduwm v4, v14, v2 + lvx v2, r12, r8 + vxor v12, v12, v13 + vsrw v13, v10, v0 + stvx v4, r4, r11 + vadduwm v5, v5, v2 + vxor v12, v12, v13 + vadduwm v5, v5, 
v12 - li r7, 1*16 - li r8, 2*16 - li r9, 3*16 - li r10, 4*16 - li r11, 5*16 - li r12, 6*16 - li r6, 7*16 - - bne 0, sha256_transform_4way_swap + vrlw v12, v4, v17 + vrlw v13, v4, v19 + addi r4, r4, 2*16 + stvx v5, r4, r10 + vxor v12, v12, v13 + vsrw v13, v4, v0 + vrlw v11, v5, v17 + vxor v12, v12, v13 + vrlw v13, v5, v19 + vxor v11, v11, v13 + vsrw v13, v5, v0 + vadduwm v6, v6, v12 + vxor v11, v11, v13 + stvx v6, r4, r11 + vadduwm v7, v7, v11 - lvx v11, 0, r4 - lvx v1, r4, r7 - lvx v2, r4, r8 - lvx v3, r4, r9 - lvx v4, r4, r10 - lvx v5, r4, r11 - lvx v6, r4, r12 - lvx v7, r4, r6 - addi r5, r1, 4*4 - stvx v11, 0, r5 - stvx v1, r5, r7 - stvx v2, r5, r8 - stvx v3, r5, r9 - stvx v4, r5, r10 - stvx v5, r5, r11 - stvx v6, r5, r12 - stvx v7, r5, r6 - addi r4, r4, 8*16 - lvx v0, 0, r4 - lvx v4, r4, r7 - lvx v5, r4, r8 - lvx v6, r4, r9 - lvx v7, r4, r10 - lvx v8, r4, r11 - lvx v9, r4, r12 - lvx v10, r4, r6 - addi r4, r1, 4*4+8*16 - stvx v0, 0, r4 - stvx v4, r4, r7 - stvx v5, r4, r8 - stvx v6, r4, r9 + vrlw v12, v6, v17 + vrlw v13, v6, v19 + addi r4, r4, 2*16 stvx v7, r4, r10 + vxor v12, v12, v13 + vsrw v13, v6, v0 + vrlw v11, v7, v17 + vxor v12, v12, v13 + vrlw v13, v7, v19 + vxor v11, v11, v13 + vsrw v13, v7, v0 + vadduwm v8, v8, v12 + vxor v11, v11, v13 stvx v8, r4, r11 - stvx v9, r4, r12 - stvx v10, r4, r6 - b sha256_transform_4way_extend - -sha256_transform_4way_swap: -#ifdef _AIX - ld r5, T.br_perm(r2) -#else - lis r5, HI(br_perm) - addi r5, r5, LO(br_perm) -#endif - lvx v19, 0, r5 + vadduwm v9, v9, v11 - lvx v11, 0, r4 - lvx v1, r4, r7 - lvx v2, r4, r8 - lvx v3, r4, r9 - lvx v4, r4, r10 - lvx v5, r4, r11 - lvx v6, r4, r12 - lvx v7, r4, r6 - vperm v11, v11, v11, v19 - vperm v1, v1, v1, v19 - vperm v2, v2, v2, v19 - vperm v3, v3, v3, v19 - vperm v4, v4, v4, v19 - vperm v5, v5, v5, v19 - vperm v6, v6, v6, v19 - vperm v7, v7, v7, v19 - addi r5, r1, 4*4 - stvx v11, 0, r5 - stvx v1, r5, r7 - stvx v2, r5, r8 - stvx v3, r5, r9 - stvx v4, r5, r10 - stvx v5, r5, r11 - stvx v6, r5, r12 - stvx v7, r5, r6 - addi r4, r4, 8*16 - lvx v0, 0, r4 - lvx v4, r4, r7 - lvx v5, r4, r8 - lvx v6, r4, r9 - lvx v7, r4, r10 - lvx v8, r4, r11 - lvx v9, r4, r12 - lvx v10, r4, r6 - vperm v0, v0, v0, v19 - vperm v4, v4, v4, v19 - vperm v5, v5, v5, v19 - vperm v6, v6, v6, v19 - vperm v7, v7, v7, v19 - vperm v8, v8, v8, v19 - vperm v9, v9, v9, v19 - vperm v10, v10, v10, v19 - addi r4, r1, 4*4+8*16 - stvx v0, 0, r4 - stvx v4, r4, r7 - stvx v5, r4, r8 - stvx v6, r4, r9 + lvx v2, r12, r9 + vrlw v14, v8, v17 + vrlw v13, v8, v19 + vrlw v12, v9, v17 + addi r4, r4, 2*16 + stvx v9, r4, r10 + vxor v14, v14, v13 + vrlw v13, v9, v19 + vxor v12, v12, v13 + vsrw v13, v8, v0 + vxor v14, v14, v13 + vsrw v13, v9, v0 + vxor v12, v12, v13 + vadduwm v4, v4, v2 + vadduwm v10, v10, v14 + vadduwm v4, v4, v12 + stvx v10, r4, r11 + addi r4, r4, 2*16 + lvx v11, r4, r8 + + vadduwm v5, v5, v3 + stvx v4, r4, r10 + vrlw v14, v11, v1 + vrlw v13, v11, v18 + vrlw v12, v10, v17 + vxor v14, v14, v13 + vrlw v13, v10, v19 + vxor v12, v12, v13 + vsrw v13, v11, v16 + vxor v14, v14, v13 + vsrw v13, v10, v0 + vxor v12, v12, v13 + vadduwm v5, v5, v14 + vadduwm v5, v5, v12 + stvx v5, r4, r11 + addi r4, r4, 2*16 + + lvx v14, r4, r7 + vrlw v12, v4, v17 + vrlw v13, v4, v19 + vadduwm v11, v11, v6 + vxor v12, v12, v13 + vsrw v13, v4, v0 + vrlw v6, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v6, v6, v13 + vsrw v13, v14, v16 + lvx v11, r4, r8 + vxor v6, v6, v13 + vrlw v13, v5, v19 + vadduwm v6, v6, v12 + + vrlw v12, v5, v17 + vadduwm v14, 
v14, v7 + vxor v12, v12, v13 + vsrw v13, v5, v0 + vrlw v7, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v6, r4, r10 + vxor v7, v7, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v7, v7, v13 + vadduwm v7, v7, v14 + stvx v7, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vrlw v12, v6, v17 + vrlw v13, v6, v19 + vadduwm v11, v11, v8 + vxor v12, v12, v13 + vsrw v13, v6, v0 + vrlw v8, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v8, v8, v13 + vsrw v13, v14, v16 + lvx v11, r4, r8 + vxor v8, v8, v13 + vrlw v13, v7, v19 + vadduwm v8, v8, v12 + + vrlw v12, v7, v17 + vadduwm v14, v14, v9 + vxor v12, v12, v13 + vsrw v13, v7, v0 + vrlw v9, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v8, r4, r10 + vxor v9, v9, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v9, v9, v13 + vadduwm v9, v9, v14 + stvx v9, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vrlw v12, v8, v17 + vrlw v13, v8, v19 + vadduwm v11, v11, v10 + vxor v12, v12, v13 + vsrw v13, v8, v0 + vrlw v10, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v10, v10, v13 + vsrw v13, v14, v16 + lvx v11, r4, r8 + vxor v10, v10, v13 + vrlw v13, v9, v19 + vadduwm v10, v10, v12 + + vrlw v12, v9, v17 + vadduwm v14, v14, v4 + vxor v12, v12, v13 + vsrw v13, v9, v0 + vrlw v4, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v10, r4, r10 + vxor v4, v4, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v4, v4, v13 + vadduwm v4, v4, v14 + stvx v4, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vrlw v12, v10, v17 + vrlw v13, v10, v19 + vadduwm v11, v11, v5 + vxor v12, v12, v13 + vsrw v13, v10, v0 + vrlw v5, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v5, v5, v13 + vsrw v13, v14, v16 + lvx v11, r4, r8 + vxor v5, v5, v13 + vrlw v13, v4, v19 + vadduwm v5, v5, v12 + + vrlw v12, v4, v17 + vadduwm v14, v14, v6 + vxor v12, v12, v13 + vsrw v13, v4, v0 + vrlw v6, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v5, r4, r10 + vxor v6, v6, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v6, v6, v13 + vadduwm v6, v6, v14 + stvx v6, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vrlw v12, v5, v17 + vrlw v13, v5, v19 + vadduwm v11, v11, v7 + vxor v12, v12, v13 + vsrw v13, v5, v0 + vrlw v7, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v7, v7, v13 + vsrw v13, v14, v16 + lvx v11, r4, r8 + vxor v7, v7, v13 + vrlw v13, v6, v19 + vadduwm v7, v7, v12 + + vrlw v12, v6, v17 + vadduwm v14, v14, v8 + vxor v12, v12, v13 + vsrw v13, v6, v0 + vrlw v8, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 stvx v7, r4, r10 + vxor v8, v8, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v8, v8, v13 + vadduwm v8, v8, v14 stvx v8, r4, r11 - stvx v9, r4, r12 - stvx v10, r4, r6 - -sha256_transform_4way_extend: - li r10, 16*16 - li r11, 17*16 - sha256_4way_extend_setup - sha256_4way_extend_doubleround 0, r5, v4, v5, v9, v10 - sha256_4way_extend_doubleround 2, r5, v6, v7, v4, v5 - sha256_4way_extend_doubleround 4, r5, v8, v9, v6, v7 - sha256_4way_extend_doubleround 6, r5, v10, v4, v8, v9 - sha256_4way_extend_doubleround 8, r5, v5, v6, v10, v4 - sha256_4way_extend_doubleround 10, r5, v7, v8, v5, v6 - sha256_4way_extend_doubleround 12, r5, v9, v10, v7, v8 - sha256_4way_extend_doubleround 14, r5, v4, v5, v9, v10 - sha256_4way_extend_doubleround 16, r5, v6, v7, v4, v5 - sha256_4way_extend_doubleround 18, r5, v8, v9, v6, v7 - sha256_4way_extend_doubleround 20, r5, v10, v4, v8, v9 - 
sha256_4way_extend_doubleround 22, r5, v5, v6, v10, v4 - sha256_4way_extend_doubleround 24, r5, v7, v8, v5, v6 - sha256_4way_extend_doubleround 26, r5, v9, v10, v7, v8 - sha256_4way_extend_doubleround 28, r5, v4, v5, v9, v10 - sha256_4way_extend_doubleround 30, r5, v6, v7, v4, v5 - sha256_4way_extend_doubleround 32, r5, v8, v9, v6, v7 - sha256_4way_extend_doubleround 34, r5, v10, v4, v8, v9 - sha256_4way_extend_doubleround 36, r5, v5, v6, v10, v4 - sha256_4way_extend_doubleround 38, r5, v7, v8, v5, v6 - sha256_4way_extend_doubleround 40, r5, v9, v10, v7, v8 - sha256_4way_extend_doubleround 42, r5, v4, v5, v9, v10 - sha256_4way_extend_doubleround 44, r5, v6, v7, v4, v5 - sha256_4way_extend_doubleround 46, r5, v8, v9, v6, v7 - - addi r11, r3, 4*16 - lvx v4, 0, r3 - lvx v5, r3, r7 - lvx v6, r3, r8 - lvx v7, r3, r9 - lvx v8, 0, r11 - lvx v9, r11, r7 - lvx v10, r11, r8 - lvx v11, r11, r9 -#ifdef _AIX - ld r12, T.sha256_4k(r2) -#else - lis r12, HI(sha256_4k) - addi r12, r12, LO(sha256_4k) -#endif - addi r5, r1, 4*4 - sha256_4way_main_setup - sha256_4way_main_quadround 0, r12, r5 - sha256_4way_main_quadround 4, r12, r5 - sha256_4way_main_quadround 8, r12, r5 - sha256_4way_main_quadround 12, r12, r5 - sha256_4way_main_quadround 16, r12, r5 - sha256_4way_main_quadround 20, r12, r5 - sha256_4way_main_quadround 24, r12, r5 - sha256_4way_main_quadround 28, r12, r5 - sha256_4way_main_quadround 32, r12, r5 - sha256_4way_main_quadround 36, r12, r5 - sha256_4way_main_quadround 40, r12, r5 - sha256_4way_main_quadround 44, r12, r5 - sha256_4way_main_quadround 48, r12, r5 - sha256_4way_main_quadround 52, r12, r5 - sha256_4way_main_quadround 56, r12, r5 - sha256_4way_main_quadround 60, r12, r5 - - lvx v12, 0, r3 - lvx v13, r3, r7 - lvx v14, r3, r8 - lvx v15, r3, r9 - lvx v16, 0, r11 - lvx v17, r11, r7 - lvx v18, r11, r8 - lvx v19, r11, r9 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vrlw v12, v7, v17 + vrlw v13, v7, v19 + vadduwm v11, v11, v9 + vxor v12, v12, v13 + vsrw v13, v7, v0 + vrlw v9, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v9, v9, v13 + vsrw v13, v14, v16 + lvx v11, r4, r8 + vxor v9, v9, v13 + vrlw v13, v8, v19 + vadduwm v9, v9, v12 + + vrlw v12, v8, v17 + vadduwm v14, v14, v10 + vxor v12, v12, v13 + vsrw v13, v8, v0 + vrlw v10, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v9, r4, r10 + vxor v10, v10, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v10, v10, v13 + vadduwm v10, v10, v14 + stvx v10, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vrlw v12, v9, v17 + vrlw v13, v9, v19 + vadduwm v11, v11, v4 + vxor v12, v12, v13 + vsrw v13, v9, v0 + vrlw v4, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v4, v4, v13 + vsrw v13, v14, v16 + lvx v11, r4, r8 + vxor v4, v4, v13 + vrlw v13, v10, v19 vadduwm v4, v4, v12 - vadduwm v5, v5, v13 - vadduwm v6, v6, v14 - vadduwm v7, v7, v15 - vadduwm v8, v8, v16 - vadduwm v9, v9, v17 - vadduwm v10, v10, v18 - vadduwm v11, v11, v19 - stvx v4, 0, r3 - stvx v5, r3, r7 - stvx v6, r3, r8 - stvx v7, r3, r9 - stvx v8, 0, r11 - stvx v9, r11, r7 - stvx v10, r11, r8 - stvx v11, r11, r9 - - ld r1, 0(r1) - mtspr 256, r0 - blr + vrlw v12, v10, v17 + vadduwm v14, v14, v5 + vxor v12, v12, v13 + vsrw v13, v10, v0 + vrlw v5, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v4, r4, r10 + vxor v5, v5, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v5, v5, v13 + vadduwm v5, v5, v14 + stvx v5, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vrlw v12, v4, v17 + vrlw v13, v4, v19 + 
vadduwm v11, v11, v6 + vxor v12, v12, v13 + vsrw v13, v4, v0 + vrlw v6, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v6, v6, v13 + vsrw v13, v14, v16 + lvx v11, r4, r8 + vxor v6, v6, v13 + vrlw v13, v5, v19 + vadduwm v6, v6, v12 - .align 2 - .globl sha256d_ms_4way - .globl _sha256d_ms_4way - .globl .sha256d_ms_4way -#ifdef __ELF__ - .type sha256d_ms_4way, %function -#endif -sha256d_ms_4way: -_sha256d_ms_4way: -.sha256d_ms_4way: - mfspr r0, 256 - oris r12, r0, 0xffff - ori r12, r12, 0xf000 - mtspr 256, r12 - - andi. r12, r1, 15 - li r11, -(4*4+64*16) - subf r12, r12, r11 - stdux r1, r1, r12 - - li r7, 1*16 - li r8, 2*16 - li r9, 3*16 - li r10, 16*16 - li r11, 17*16 - - sha256_4way_extend_setup - + vrlw v12, v5, v17 + vadduwm v14, v14, v7 + vxor v12, v12, v13 + vsrw v13, v5, v0 + vrlw v7, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v6, r4, r10 + vxor v7, v7, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v7, v7, v13 + vadduwm v7, v7, v14 + stvx v7, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vrlw v12, v6, v17 + vrlw v13, v6, v19 + vadduwm v11, v11, v8 + vxor v12, v12, v13 + vsrw v13, v6, v0 + vrlw v8, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v8, v8, v13 + vsrw v13, v14, v16 + lvx v11, r4, r8 + vxor v8, v8, v13 + vrlw v13, v7, v19 + vadduwm v8, v8, v12 + + vrlw v12, v7, v17 + vadduwm v14, v14, v9 + vxor v12, v12, v13 + vsrw v13, v7, v0 + vrlw v9, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v8, r4, r10 + vxor v9, v9, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v9, v9, v13 + vadduwm v9, v9, v14 + stvx v9, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vrlw v12, v8, v17 + vrlw v13, v8, v19 + vadduwm v11, v11, v10 + vxor v12, v12, v13 + vsrw v13, v8, v0 + vrlw v10, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v10, v10, v13 + vsrw v13, v14, v16 + lvx v11, r4, r8 + vxor v10, v10, v13 + vrlw v13, v9, v19 + vadduwm v10, v10, v12 + + vrlw v12, v9, v17 + vadduwm v14, v14, v4 + vxor v12, v12, v13 + vsrw v13, v9, v0 + vrlw v4, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v10, r4, r10 + vxor v4, v4, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v4, v4, v13 + vadduwm v4, v4, v14 + stvx v4, r4, r11 addi r4, r4, 2*16 - addi r12, r1, 4*4+18*16 lvx v14, r4, r7 - lvx v6, r4, r10 - lvx v7, r4, r11 - - vrlw v12, v14, v1 - vrlw v13, v14, v18 - stvx v6, 0, r12 + vrlw v12, v10, v17 + vrlw v13, v10, v19 + vadduwm v11, v11, v5 + vxor v12, v12, v13 + vsrw v13, v10, v0 + vrlw v5, v14, v1 vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v5, v5, v13 vsrw v13, v14, v16 - stvx v7, r12, r7 + lvx v11, r4, r8 + vxor v5, v5, v13 + vrlw v13, v4, v19 + vadduwm v5, v5, v12 + + vrlw v12, v4, v17 + vadduwm v14, v14, v6 vxor v12, v12, v13 - vadduwm v6, v6, v12 - vadduwm v7, v7, v14 - stvx v6, r4, r10 - - vrlw v12, v6, v17 + vsrw v13, v4, v0 + vrlw v6, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v5, r4, r10 + vxor v6, v6, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v6, v6, v13 + vadduwm v6, v6, v14 + stvx v6, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vrlw v12, v5, v17 + vrlw v13, v5, v19 + vadduwm v11, v11, v7 + vxor v12, v12, v13 + vsrw v13, v5, v0 + vrlw v7, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v7, v7, v13 + vsrw v13, v14, v16 + lvx v11, r4, r8 + vxor v7, v7, v13 vrlw v13, v6, v19 - stvx v7, r4, r11 - addi r4, r4, 18*16 - lvx v8, 0, r4 + vadduwm v7, v7, 
v12 + + vrlw v12, v6, v17 + vadduwm v14, v14, v8 vxor v12, v12, v13 vsrw v13, v6, v0 - stvx v8, r12, r8 + vrlw v8, v11, v1 vxor v12, v12, v13 - vadduwm v8, v8, v12 - - vrlw v9, v7, v17 + vrlw v13, v11, v18 + stvx v7, r4, r10 + vxor v8, v8, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v8, v8, v13 + vadduwm v8, v8, v14 + stvx v8, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vrlw v12, v7, v17 vrlw v13, v7, v19 - stvx v8, 0, r4 - vxor v9, v9, v13 + vadduwm v11, v11, v9 + vxor v12, v12, v13 vsrw v13, v7, v0 + vrlw v9, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v9, v9, v13 + vsrw v13, v14, v16 + lvx v11, r4, r8 vxor v9, v9, v13 - - vrlw v12, v8, v17 vrlw v13, v8, v19 - stvx v9, r4, r7 + vadduwm v9, v9, v12 + + vrlw v12, v8, v17 + vadduwm v14, v14, v10 vxor v12, v12, v13 vsrw v13, v8, v0 - lvx v10, r4, r8 - lvx v4, r4, r9 + vrlw v10, v11, v1 vxor v12, v12, v13 - stvx v10, r12, r9 - addi r12, r12, 4*16 - stvx v4, 0, r12 - vrlw v14, v9, v17 + vrlw v13, v11, v18 + stvx v9, r4, r10 + vxor v10, v10, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v10, v10, v13 + vadduwm v10, v10, v14 + stvx v10, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vrlw v12, v9, v17 vrlw v13, v9, v19 - vadduwm v10, v10, v12 + vadduwm v11, v11, v4 + vxor v12, v12, v13 + vsrw v13, v9, v0 + vrlw v4, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor v4, v4, v13 + vsrw v13, v14, v16 + lvx v11, r4, r8 + vxor v4, v4, v13 + vrlw v13, v10, v19 + vadduwm v4, v4, v12 + + vrlw v12, v10, v17 + vadduwm v14, v14, v5 + vxor v12, v12, v13 + vsrw v13, v10, v0 + vrlw v5, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx v4, r4, r10 + vxor v5, v5, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v5, v5, v13 + vadduwm v5, v5, v14 + stvx v5, r4, r11 + addi r4, r4, 2*16 + + lvx v14, r4, r7 + vrlw v12, v4, v17 + vrlw v13, v4, v19 + vadduwm v15, v11, v6 + vrlw v6, v14, v1 + vrlw v11, v14, v18 + vxor v12, v12, v13 + vxor v6, v6, v11 + vsrw v13, v4, v0 + vsrw v14, v14, v16 + vxor v12, v12, v13 + vxor v6, v6, v14 + vadduwm v12, v12, v15 + vadduwm v6, v6, v12 + stvx v6, r4, r10 + addi r4, r4, -44*16 +#ifdef _AIX + ld r5, T.sha256_4h(r2) +#else + lis r5, HI(sha256_4h) + addi r5, r5, LO(sha256_4h) +#endif + lvx v4, 0, r5 + lvx v5, r5, r7 + lvx v6, r5, r8 + lvx v7, r5, r9 + addi r12, r5, 4*16 + lvx v8, 0, r12 + lvx v9, r12, r7 + lvx v10, r12, r8 + lvx v11, r12, r9 +#ifdef _AIX + ld r12, T.sha256_4k(r2) +#else + lis r12, HI(sha256_4k) + addi r12, r12, LO(sha256_4k) +#endif + vspltisw v2, 12 + vspltisw v3, -5 + vspltisw v16, -6 + vspltisw v17, -11 + vspltisw v18, -2 + li r6, (0+0)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 vxor v14, v14, v13 - vsrw v13, v9, v0 - stvx v10, r4, r8 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (0+1)*16 + lvx v12, r4, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, 
v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (0+2)*16 + lvx v12, r4, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (0+3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (4+0)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (4+1)*16 + lvx v12, r4, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (4+2)*16 + lvx v12, r4, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (4+3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + 
vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (8+0)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 vxor v14, v14, v13 - vadduwm v4, v4, v14 - - vrlw v12, v10, v17 - vrlw v13, v10, v19 - stvx v4, r4, r9 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 vxor v12, v12, v13 - vsrw v13, v10, v0 - vrlw v14, v4, v17 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (8+1)*16 + lvx v12, r4, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 vxor v12, v12, v13 - vrlw v13, v4, v19 - addi r4, r4, 4*16 - lvx v5, 0, r4 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (8+2)*16 + lvx v12, r4, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 vxor v14, v14, v13 - stvx v5, r12, r7 - vsrw v13, v4, v0 - vadduwm v5, v5, v12 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (8+3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 vxor v14, v14, v13 - stvx v5, 0, r4 - vadduwm v6, v6, v14 - - vrlw v12, v5, v17 - vrlw v13, v5, v19 - stvx v6, r4, r7 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 vxor v12, v12, v13 - vsrw v13, v5, v0 - vrlw v14, v6, v17 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (12+0)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 vxor v12, v12, v13 - vrlw v13, v6, v19 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (12+1)*16 + lvx v12, r4, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 vxor v14, v14, v13 - vsrw v13, v6, v0 - vadduwm v7, v7, v12 + vadduwm 
v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (12+2)*16 + lvx v12, r4, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 vxor v14, v14, v13 - stvx v7, r4, r8 - vadduwm v8, v8, v14 - - vrlw v12, v7, v17 - vrlw v13, v7, v19 - stvx v8, r4, r9 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 vxor v12, v12, v13 - vsrw v13, v7, v0 - vrlw v14, v8, v17 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (12+3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (16+0)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 vrlw v13, v8, v19 + vadduwm v11, v11, v12 vxor v14, v14, v13 - vsrw v13, v8, v0 - vadduwm v9, v9, v12 - vxor v14, v14, v13 - addi r4, r4, 4*16 - stvx v9, 0, r4 - vadduwm v10, v10, v14 - - vrlw v12, v9, v17 - vrlw v13, v9, v19 - stvx v10, r4, r7 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 vxor v12, v12, v13 - vsrw v13, v9, v0 - lvx v11, r4, r8 - lvx v14, r4, r9 - stvx v11, r12, r8 - stvx v14, r12, r9 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (16+1)*16 + lvx v12, r4, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 vxor v12, v12, v13 - vadduwm v11, v11, v12 - vadduwm v5, v5, v14 - vrlw v12, v10, v17 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (16+2)*16 + lvx v12, r4, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 vrlw v13, v10, v19 - vadduwm v4, v4, v11 - + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 vxor v12, v12, v13 - vsrw v13, v10, v0 - stvx v4, r4, r8 + vxor v14, v14, 
v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (16+3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 vxor v12, v12, v13 - vadduwm v5, v5, v12 - stvx v5, r4, r9 - addi r4, r4, -12*16 - lvx v11, 0, r4 - - sha256_4way_extend_doubleround 16, r4, v6, v7, v4, v5 - sha256_4way_extend_doubleround 18, r4, v8, v9, v6, v7 - sha256_4way_extend_doubleround 20, r4, v10, v4, v8, v9 - sha256_4way_extend_doubleround 22, r4, v5, v6, v10, v4 - sha256_4way_extend_doubleround 24, r4, v7, v8, v5, v6 - sha256_4way_extend_doubleround 26, r4, v9, v10, v7, v8 - sha256_4way_extend_doubleround 28, r4, v4, v5, v9, v10 - sha256_4way_extend_doubleround 30, r4, v6, v7, v4, v5 - sha256_4way_extend_doubleround 32, r4, v8, v9, v6, v7 - sha256_4way_extend_doubleround 34, r4, v10, v4, v8, v9 - sha256_4way_extend_doubleround 36, r4, v5, v6, v10, v4 - sha256_4way_extend_doubleround 38, r4, v7, v8, v5, v6 - sha256_4way_extend_doubleround 40, r4, v9, v10, v7, v8 - sha256_4way_extend_doubleround 42, r4, v4, v5, v9, v10 - sha256_4way_extend_doubleround 44, r4, v6, v7, v4, v5 - sha256_4way_extend_doubleround 46, r4, v8, v9, v6, v7 - addi r4, r4, -48*16 - - lvx v4, 0, r6 - lvx v9, r6, r7 - lvx v10, r6, r8 - lvx v11, r6, r9 - addi r12, r6, 4*16 - lvx v8, 0, r12 - lvx v5, r12, r7 - lvx v6, r12, r8 - lvx v7, r12, r9 -#ifdef _AIX - ld r12, T.sha256_4k(r2) -#else - lis r12, HI(sha256_4k) - addi r12, r12, LO(sha256_4k) -#endif - sha256_4way_main_setup - sha256_4way_main_round 3, r12, r4, v5, v6, v7, v4, v9, v10, v11, v8 - sha256_4way_main_quadround 4, r12, r4 - sha256_4way_main_quadround 8, r12, r4 - sha256_4way_main_quadround 12, r12, r4 - sha256_4way_main_quadround 16, r12, r4 - sha256_4way_main_quadround 20, r12, r4 - sha256_4way_main_quadround 24, r12, r4 - sha256_4way_main_quadround 28, r12, r4 - sha256_4way_main_quadround 32, r12, r4 - sha256_4way_main_quadround 36, r12, r4 - sha256_4way_main_quadround 40, r12, r4 - sha256_4way_main_quadround 44, r12, r4 - sha256_4way_main_quadround 48, r12, r4 - sha256_4way_main_quadround 52, r12, r4 - sha256_4way_main_quadround 56, r12, r4 - sha256_4way_main_quadround 60, r12, r4 - - lvx v12, 0, r5 - lvx v13, r5, r7 - lvx v14, r5, r8 - lvx v15, r5, r9 - addi r12, r5, 4*16 - lvx v16, 0, r12 - lvx v17, r12, r7 - lvx v18, r12, r8 - lvx v19, r12, r9 - vadduwm v4, v4, v12 - vadduwm v5, v5, v13 - vadduwm v6, v6, v14 - vadduwm v7, v7, v15 - vadduwm v8, v8, v16 - vadduwm v9, v9, v17 - vadduwm v10, v10, v18 - vadduwm v11, v11, v19 - addi r12, r1, 4*4 - stvx v4, 0, r12 - stvx v5, r12, r7 - stvx v6, r12, r8 - stvx v7, r12, r9 - addi r12, r12, 4*16 - stvx v8, 0, r12 - stvx v9, r12, r7 - stvx v10, r12, r8 - stvx v11, r12, r9 - - addi r12, r1, 4*4+18*16 - lvx v4, 0, r12 - lvx v5, r12, r7 - lvx v6, r12, r8 - lvx v7, r12, r9 - addi r12, r12, 4*16 - lvx v8, 0, r12 - lvx v9, r12, r7 - lvx v10, r12, r8 - lvx v11, r12, r9 - addi r12, r4, 18*16 - stvx v4, 0, r12 - stvx v5, r12, r7 - stvx v6, r12, r8 - addi r12, r4, 22*16 - stvx v7, 0, r12 - stvx v8, r12, r7 - stvx v9, r12, r8 - addi r12, r4, 30*16 - stvx v10, 0, r12 - stvx v11, r12, r7 - - addi r4, r1, 4*4 - - sha256_4way_extend_setup - -#ifdef _AIX - ld r12, 
T.sha256d_4preext2(r2) -#else - lis r12, HI(sha256d_4preext2) - addi r12, r12, LO(sha256d_4preext2) -#endif - lvx v2, 0, r12 - - vxor v9, v9, v9 - vspltisw v3, 1 - lvx v4, r12, r8 - vsldoi v3, v3, v3, 1 - addi r5, r1, 4*4+8*16 - stvx v4, 0, r5 - stvx v9, r5, r7 - stvx v9, r5, r8 - stvx v9, r5, r9 - addi r5, r5, 4*16 - stvx v9, 0, r5 - stvx v9, r5, r7 - stvx v9, r5, r8 - stvx v3, r5, r9 - - lvx v4, 0, r4 - lvx v14, r4, r7 - - lvx v11, r4, r8 - vrlw v12, v14, v1 - vrlw v13, v14, v18 - + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (20+0)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 vxor v12, v12, v13 - vsrw v13, v14, v16 - vadduwm v5, v14, v2 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (20+1)*16 + lvx v12, r4, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 vxor v12, v12, v13 - vrlw v14, v11, v1 - vrlw v13, v11, v18 - vadduwm v4, v4, v12 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (20+2)*16 + lvx v12, r4, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 vxor v14, v14, v13 - vsrw v13, v11, v16 - stvx v4, r4, r10 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (20+3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 vxor v14, v14, v13 - vrlw v12, v4, v17 - vrlw v13, v4, v19 - vadduwm v5, v5, v14 - - stvx v5, r4, r11 - addi r4, r4, 2*16 - lvx v14, r4, r7 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 vxor v12, v12, v13 - vsrw v13, v4, v0 - vrlw v6, v14, v1 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (24+0)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + 
vand v14, v4, v5 vxor v12, v12, v13 - vrlw v13, v14, v18 - vxor v6, v6, v13 - vsrw v13, v14, v16 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (24+1)*16 + lvx v12, r4, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (24+2)*16 + lvx v12, r4, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (24+3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (28+0)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 vadduwm v11, v11, v12 - vxor v6, v6, v13 - vrlw v12, v5, v17 - vrlw v13, v5, v19 - vadduwm v6, v6, v11 - lvx v11, r4, r8 - - stvx v6, r4, r10 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 vxor v12, v12, v13 - vsrw v13, v5, v0 - vrlw v7, v11, v1 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (28+1)*16 + lvx v12, r4, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 vxor v12, v12, v13 - vrlw v13, v11, v18 - vxor v7, v7, v13 - vsrw v13, v11, v16 - vadduwm v14, v14, v12 - vxor v7, v7, v13 - vrlw v12, v6, v17 - vrlw v13, v6, v19 - vadduwm v7, v7, v14 - - stvx v7, r4, r11 - addi r4, r4, 2*16 - lvx v14, r4, r7 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (28+2)*16 + lvx v12, r4, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, 
v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 vxor v12, v12, v13 - vsrw v13, v6, v0 - vrlw v8, v14, v1 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (28+3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 vxor v12, v12, v13 - vrlw v13, v14, v18 - vxor v8, v8, v13 - vsrw v13, v14, v16 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (32+0)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 vadduwm v11, v11, v12 - vxor v8, v8, v13 - vrlw v12, v7, v17 - vrlw v13, v7, v19 - vadduwm v8, v8, v11 - lvx v11, r4, r8 - - stvx v8, r4, r10 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 vxor v12, v12, v13 - vsrw v13, v7, v0 - vrlw v9, v11, v1 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (32+1)*16 + lvx v12, r4, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 vxor v12, v12, v13 - vrlw v13, v11, v18 - vxor v9, v9, v13 - vsrw v13, v11, v16 - vadduwm v14, v14, v12 - vxor v9, v9, v13 - vrlw v12, v8, v17 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (32+2)*16 + lvx v12, r4, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (32+3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, 
v4 + vadduwm v4, v15, v13 + li r6, (36+0)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (36+1)*16 + lvx v12, r4, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (36+2)*16 + lvx v12, r4, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 vadduwm v9, v9, v14 - - stvx v9, r4, r11 - addi r4, r4, 2*16 - lvx v14, r4, r7 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (36+3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (40+0)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (40+1)*16 + lvx v12, r4, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (40+2)*16 + lvx v12, r4, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, 
r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 vxor v12, v12, v13 - vsrw v13, v8, v0 - vrlw v10, v14, v1 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (40+3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 vxor v12, v12, v13 - vrlw v13, v14, v18 - vxor v10, v10, v13 - vsrw v13, v14, v16 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (44+0)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 vadduwm v11, v11, v12 - vxor v10, v10, v13 - vrlw v12, v9, v17 - vrlw v13, v9, v19 - vadduwm v11, v11, v3 - vadduwm v14, v14, v4 - vadduwm v10, v10, v11 - - lvx v2, r12, r7 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 vxor v12, v12, v13 - vsrw v13, v9, v0 - stvx v10, r4, r10 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (44+1)*16 + lvx v12, r4, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 vxor v12, v12, v13 - vadduwm v14, v14, v12 - vrlw v12, v10, v17 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (44+2)*16 + lvx v12, r4, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 vrlw v13, v10, v19 - vadduwm v4, v14, v2 - lvx v2, r12, r8 - vxor v12, v12, v13 - vsrw v13, v10, v0 - stvx v4, r4, r11 - vadduwm v5, v5, v2 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 vxor v12, v12, v13 - vadduwm v5, v5, v12 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (44+3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 - vrlw v12, v4, v17 - vrlw v13, v4, v19 - addi 
r4, r4, 2*16 - stvx v5, r4, r10 + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 vxor v12, v12, v13 - vsrw v13, v4, v0 - vrlw v11, v5, v17 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (48+0)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 vxor v12, v12, v13 - vrlw v13, v5, v19 - vxor v11, v11, v13 - vsrw v13, v5, v0 - vadduwm v6, v6, v12 - vxor v11, v11, v13 - stvx v6, r4, r11 - vadduwm v7, v7, v11 - - vrlw v12, v6, v17 - vrlw v13, v6, v19 - addi r4, r4, 2*16 - stvx v7, r4, r10 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (48+1)*16 + lvx v12, r4, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 vxor v12, v12, v13 - vsrw v13, v6, v0 - vrlw v11, v7, v17 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (48+2)*16 + lvx v12, r4, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 vxor v12, v12, v13 - vrlw v13, v7, v19 - vxor v11, v11, v13 - vsrw v13, v7, v0 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (48+3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 vadduwm v8, v8, v12 - vxor v11, v11, v13 - stvx v8, r4, r11 - vadduwm v9, v9, v11 - - lvx v2, r12, r9 - vrlw v14, v8, v17 - vrlw v13, v8, v19 - vrlw v12, v9, v17 - addi r4, r4, 2*16 - stvx v9, r4, r10 vxor v14, v14, v13 - vrlw v13, v9, v19 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 vxor v12, v12, v13 - vsrw v13, v8, v0 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (52+0)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 vxor v14, v14, v13 - vsrw v13, v9, v0 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 vxor v12, v12, v13 - vadduwm 
v4, v4, v2 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + li r6, (52+1)*16 + lvx v12, r4, r6 + vand v13, v8, v11 + vandc v14, v9, v11 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v11, v3 vadduwm v10, v10, v14 - vadduwm v4, v4, v12 - stvx v10, r4, r11 - addi r4, r4, 2*16 - lvx v11, r4, r8 - - vadduwm v5, v5, v3 - stvx v4, r4, r10 - vrlw v14, v11, v1 - vrlw v13, v11, v18 - vrlw v12, v10, v17 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 vxor v14, v14, v13 - vrlw v13, v10, v19 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vxor v15, v7, v4 + vadduwm v10, v10, v13 + + vrlw v13, v7, v17 + vand v15, v15, v5 + vxor v12, v7, v13 + vrlw v13, v7, v2 + vand v14, v7, v4 vxor v12, v12, v13 - vsrw v13, v11, v16 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v10, v14 + vadduwm v10, v10, v6 + vadduwm v6, v15, v13 + li r6, (52+2)*16 + lvx v12, r4, r6 + vand v13, v11, v10 + vandc v14, v8, v10 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 vxor v14, v14, v13 - vsrw v13, v10, v0 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vxor v15, v6, v7 + vadduwm v9, v9, v13 + + vrlw v13, v6, v17 + vand v15, v15, v4 + vxor v12, v6, v13 + vrlw v13, v6, v2 + vand v14, v6, v7 vxor v12, v12, v13 - vadduwm v5, v5, v14 - vadduwm v5, v5, v12 - stvx v5, r4, r11 - addi r4, r4, 2*16 - - sha256_4way_extend_doubleround 16, r4, v6, v7, v4, v5 - sha256_4way_extend_doubleround 18, r4, v8, v9, v6, v7 - sha256_4way_extend_doubleround 20, r4, v10, v4, v8, v9 - sha256_4way_extend_doubleround 22, r4, v5, v6, v10, v4 - sha256_4way_extend_doubleround 24, r4, v7, v8, v5, v6 - sha256_4way_extend_doubleround 26, r4, v9, v10, v7, v8 - sha256_4way_extend_doubleround 28, r4, v4, v5, v9, v10 - sha256_4way_extend_doubleround 30, r4, v6, v7, v4, v5 - sha256_4way_extend_doubleround 32, r4, v8, v9, v6, v7 - sha256_4way_extend_doubleround 34, r4, v10, v4, v8, v9 - sha256_4way_extend_doubleround 36, r4, v5, v6, v10, v4 - sha256_4way_extend_doubleround 38, r4, v7, v8, v5, v6 - sha256_4way_extend_doubleround 40, r4, v9, v10, v7, v8 - sha256_4way_extend_doubleround 42, r4, v4, v5, v9, v10 - - lvx v14, r4, r7 - vrlw v12, v4, v17 - vrlw v13, v4, v19 - vadduwm v15, v11, v6 - vrlw v6, v14, v1 - vrlw v11, v14, v18 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v9, v14 + vadduwm v9, v9, v5 + vadduwm v5, v15, v13 + li r6, (52+3)*16 + lvx v12, r4, r6 + vand v13, v10, v9 + vandc v14, v11, v9 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vxor v15, v5, v6 + vadduwm v8, v8, v13 + + vrlw v13, v5, v17 + vand v15, v15, v7 + vxor v12, v5, v13 + vrlw v13, v5, v2 + vand v14, v5, v6 vxor v12, v12, v13 - vxor v6, v6, v11 - vsrw v13, v4, v0 - vsrw v14, v14, v16 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v8, v14 + vadduwm v8, v8, v4 + vadduwm v4, v15, v13 + li r6, (56)*16 + lvx v12, r4, r6 + vand v13, v9, v8 + vandc v14, v10, v8 + lvx v15, r12, r6 + vor v14, v14, v13 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vxor v15, v4, v5 + vadduwm v11, v11, v13 + + vrlw v13, v4, v17 + vand v15, v15, v6 + vxor v12, v4, v13 + vrlw v13, v4, v2 + vand v14, v4, v5 vxor v12, v12, v13 - vxor 
v6, v6, v14 - vadduwm v12, v12, v15 - vadduwm v6, v6, v12 - stvx v6, r4, r10 - addi r4, r4, -44*16 - -#ifdef _AIX - ld r5, T.sha256_4h(r2) -#else - lis r5, HI(sha256_4h) - addi r5, r5, LO(sha256_4h) -#endif - lvx v4, 0, r5 - lvx v5, r5, r7 - lvx v6, r5, r8 - lvx v7, r5, r9 - addi r12, r5, 4*16 - lvx v8, 0, r12 - lvx v9, r12, r7 - lvx v10, r12, r8 - lvx v11, r12, r9 -#ifdef _AIX - ld r12, T.sha256_4k(r2) -#else - lis r12, HI(sha256_4k) - addi r12, r12, LO(sha256_4k) -#endif - sha256_4way_main_setup - sha256_4way_main_quadround 0, r12, r4 - sha256_4way_main_quadround 4, r12, r4 - sha256_4way_main_quadround 8, r12, r4 - sha256_4way_main_quadround 12, r12, r4 - sha256_4way_main_quadround 16, r12, r4 - sha256_4way_main_quadround 20, r12, r4 - sha256_4way_main_quadround 24, r12, r4 - sha256_4way_main_quadround 28, r12, r4 - sha256_4way_main_quadround 32, r12, r4 - sha256_4way_main_quadround 36, r12, r4 - sha256_4way_main_quadround 40, r12, r4 - sha256_4way_main_quadround 44, r12, r4 - sha256_4way_main_quadround 48, r12, r4 - sha256_4way_main_quadround 52, r12, r4 - sha256_4way_main_round 56, r12, r4, v4, v5, v6, v7, v8, v9, v10, v11 - -.macro sha256_4way_main_round_red i, rk, rw, vd, ve, vf, vg, vh - li r6, (\i)*16 - vand v15, \vf, \ve - vandc v14, \vg, \ve - lvx v12, \rw, r6 - vadduwm \vh, \vh, \vd + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, v11, v14 + vadduwm v11, v11, v7 + vadduwm v7, v15, v13 + + + li r6, (57)*16 + vand v15, v8, v11 + vandc v14, v9, v11 + lvx v12, r4, r6 + vadduwm v10, v10, v6 + vor v14, v14, v15 + lvx v15, r12, r6 + vrlw v13, v11, v3 + vadduwm v10, v10, v14 + vxor v14, v11, v13 + vrlw v13, v11, v19 + vadduwm v10, v10, v12 + vxor v14, v14, v13 + vadduwm v10, v10, v15 + vrlw v13, v14, v16 + vadduwm v10, v10, v13 + li r6, (58)*16 + vand v15, v11, v10 + vandc v14, v8, v10 + lvx v12, r4, r6 + vadduwm v9, v9, v5 vor v14, v14, v15 - lvx v15, \rk, r6 - vrlw v13, \ve, v3 - vadduwm \vh, \vh, v14 - vxor v14, \ve, v13 - vrlw v13, \ve, v19 - vadduwm \vh, \vh, v12 - vxor v14, v14, v13 - vadduwm \vh, \vh, v15 - vrlw v13, v14, v16 - vadduwm \vh, \vh, v13 -.endm - - sha256_4way_main_round_red 57, r12, r4, v6, v11, v8, v9, v10 - sha256_4way_main_round_red 58, r12, r4, v5, v10, v11, v8, v9 - sha256_4way_main_round_red 59, r12, r4, v4, v9, v10, v11, v8 - sha256_4way_main_round_red 60, r12, r4, v7, v8, v9, v10, v11 + lvx v15, r12, r6 + vrlw v13, v10, v3 + vadduwm v9, v9, v14 + vxor v14, v10, v13 + vrlw v13, v10, v19 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + vadduwm v9, v9, v15 + vrlw v13, v14, v16 + vadduwm v9, v9, v13 + li r6, (59)*16 + vand v15, v10, v9 + vandc v14, v11, v9 + lvx v12, r4, r6 + vadduwm v8, v8, v4 + vor v14, v14, v15 + lvx v15, r12, r6 + vrlw v13, v9, v3 + vadduwm v8, v8, v14 + vxor v14, v9, v13 + vrlw v13, v9, v19 + vadduwm v8, v8, v12 + vxor v14, v14, v13 + vadduwm v8, v8, v15 + vrlw v13, v14, v16 + vadduwm v8, v8, v13 + li r6, (60)*16 + vand v15, v9, v8 + vandc v14, v10, v8 + lvx v12, r4, r6 + vadduwm v11, v11, v7 + vor v14, v14, v15 + lvx v15, r12, r6 + vrlw v13, v8, v3 + vadduwm v11, v11, v14 + vxor v14, v8, v13 + vrlw v13, v8, v19 + vadduwm v11, v11, v12 + vxor v14, v14, v13 + vadduwm v11, v11, v15 + vrlw v13, v14, v16 + vadduwm v11, v11, v13 li r12, 7*16 lvx v19, r5, r12 diff --git a/sha2-ppc.S.orig b/sha2-ppc.S.orig new file mode 100644 index 000000000..a0b60d2ac --- /dev/null +++ b/sha2-ppc.S.orig @@ -0,0 +1,2007 @@ +/* + * Copyright 2014-2015 pooler@litecoinpool.org + * + * This program is free software; you can redistribute it and/or modify it + * 
under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. See COPYING for more details. + */ + +#include "cpuminer-config.h" + +#if defined(USE_ASM) && (defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)) + +#ifdef __APPLE__ + +#define HI(name) ha16(name) +#define LO(name) lo16(name) + +#else + +#define HI(name) name@ha +#define LO(name) name@l + +#define r0 0 +#define r1 1 +#define r2 2 +#define r3 3 +#define r4 4 +#define r5 5 +#define r6 6 +#define r7 7 +#define r8 8 +#define r9 9 +#define r10 10 +#define r11 11 +#define r12 12 +#define r13 13 +#define r14 14 +#define r15 15 +#define r16 16 +#define r17 17 +#define r18 18 +#define r19 19 +#define r20 20 +#define r21 21 +#define r22 22 +#define r23 23 +#define r24 24 +#define r25 25 +#define r26 26 +#define r27 27 +#define r28 28 +#define r29 29 +#define r30 30 +#define r31 31 + +#ifdef __ALTIVEC__ +#define v0 0 +#define v1 1 +#define v2 2 +#define v3 3 +#define v4 4 +#define v5 5 +#define v6 6 +#define v7 7 +#define v8 8 +#define v9 9 +#define v10 10 +#define v11 11 +#define v12 12 +#define v13 13 +#define v14 14 +#define v15 15 +#define v16 16 +#define v17 17 +#define v18 18 +#define v19 19 +#define v20 20 +#define v21 21 +#define v22 22 +#define v23 23 +#define v24 24 +#define v25 25 +#define v26 26 +#define v27 27 +#define v28 28 +#define v29 29 +#define v30 30 +#define v31 31 +#endif + +#endif + +#if !(defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) || \ + defined(__64BIT__) || defined(_LP64) || defined(__LP64__)) +#define ld lwz +#define std stw +#define stdu stwu +#define stdux stwux +#endif + + +#ifdef _AIX + .csect .text[RO] +#else + .data +#endif + .align 2 +sha256_h: + .long 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a + .long 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 + + .align 2 +sha256_k: + .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + +#ifdef _AIX + .toc +T.sha256_h: + .tc sha256_h[TC], sha256_h +T.sha256_k: + .tc sha256_k[TC], sha256_k +#endif + + +.macro sha256_extend_doubleround i, rw, wo, ra, rb, ry, rz + lwz r14, \wo+(\i+1)*4(\rw) + rotrwi r12, \ry, 17 + rotrwi r13, \ry, 19 + add r11, r11, \ra + xor r12, r12, r13 + srwi r13, \ry, 10 + rotrwi \ra, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor \ra, \ra, r13 + srwi r13, r14, 3 + lwz r11, \wo+(\i+2)*4(\rw) + xor \ra, \ra, r13 + rotrwi r13, \rz, 19 + add \ra, \ra, r12 + + rotrwi r12, \rz, 17 + add r14, r14, \rb + xor r12, r12, r13 + srwi r13, \rz, 10 + rotrwi \rb, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw \ra, \wo+(\i+16)*4(\rw) + xor \rb, \rb, r13 + srwi r13, 
r11, 3 + add r14, r14, r12 + xor \rb, \rb, r13 + add \rb, \rb, r14 + stw \rb, \wo+(\i+17)*4(\rw) +.endm + + +.macro sha256_main_round i, rk, rw, wo, ra, rb, rc, rd, re, rf, rg, rh + lwz r12, \wo+(\i)*4(\rw) + and r13, \rf, \re + andc r14, \rg, \re + lwz r15, (\i)*4(\rk) + or r14, r14, r13 + rotrwi r13, \re, 5 + add \rh, \rh, r14 + xor r14, \re, r13 + rotrwi r13, \re, 19 + add \rh, \rh, r12 + xor r14, r14, r13 + add \rh, \rh, r15 + rotrwi r13, r14, 6 + xor r15, \ra, \rb + add \rh, \rh, r13 + + rotrwi r13, \ra, 11 + and r15, r15, \rc + xor r12, \ra, r13 + rotrwi r13, \ra, 20 + and r14, \ra, \rb + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, \rh, r14 + add \rh, \rh, \rd + add \rd, r15, r13 +.endm + +.macro sha256_main_quadround i, rk, rw, wo + sha256_main_round \i+0, \rk, \rw, \wo, r4, r5, r6, r7, r8, r9, r10, r11 + sha256_main_round \i+1, \rk, \rw, \wo, r7, r4, r5, r6, r11, r8, r9, r10 + sha256_main_round \i+2, \rk, \rw, \wo, r6, r7, r4, r5, r10, r11, r8, r9 + sha256_main_round \i+3, \rk, \rw, \wo, r5, r6, r7, r4, r9, r10, r11, r8 +.endm + + +#ifdef _AIX + .csect .text[PR] +#else + .text +#endif + .align 2 + .globl sha256_transform + .globl _sha256_transform + .globl .sha256_transform +#ifdef __ELF__ + .type sha256_transform, %function +#endif +sha256_transform: +_sha256_transform: +.sha256_transform: + stdu r1, -76*4(r1) + cmpwi 0, r5, 0 + std r13, 2*4(r1) + std r14, 4*4(r1) + std r15, 6*4(r1) + std r16, 72*4(r1) + + bne 0, sha256_transform_swap + + lwz r11, 0*4(r4) + lwz r14, 1*4(r4) + lwz r15, 2*4(r4) + lwz r7, 3*4(r4) + lwz r8, 4*4(r4) + lwz r9, 5*4(r4) + lwz r10, 6*4(r4) + lwz r0, 7*4(r4) + lwz r12, 8*4(r4) + lwz r13, 9*4(r4) + lwz r5, 10*4(r4) + lwz r6, 11*4(r4) + stw r11, 8*4+0*4(r1) + stw r14, 8*4+1*4(r1) + stw r15, 8*4+2*4(r1) + stw r7, 8*4+3*4(r1) + stw r8, 8*4+4*4(r1) + stw r9, 8*4+5*4(r1) + stw r10, 8*4+6*4(r1) + stw r0, 8*4+7*4(r1) + stw r12, 8*4+8*4(r1) + stw r13, 8*4+9*4(r1) + stw r5, 8*4+10*4(r1) + stw r6, 8*4+11*4(r1) + lwz r7, 12*4(r4) + lwz r8, 13*4(r4) + lwz r9, 14*4(r4) + lwz r10, 15*4(r4) + mr r4, r13 + stw r7, 8*4+12*4(r1) + stw r8, 8*4+13*4(r1) + stw r9, 8*4+14*4(r1) + stw r10, 8*4+15*4(r1) + b sha256_transform_extend + +sha256_transform_swap: + li r13, 1*4 + li r14, 2*4 + li r15, 3*4 + lwbrx r11, 0, r4 + lwbrx r7, r4, r13 + lwbrx r8, r4, r14 + lwbrx r9, r4, r15 + addi r4, r4, 4*4 + stw r11, 8*4+0*4(r1) + stw r7, 8*4+1*4(r1) + stw r8, 8*4+2*4(r1) + stw r9, 8*4+3*4(r1) + lwbrx r7, 0, r4 + lwbrx r8, r4, r13 + lwbrx r9, r4, r14 + lwbrx r10, r4, r15 + addi r4, r4, 4*4 + stw r7, 8*4+4*4(r1) + stw r8, 8*4+5*4(r1) + stw r9, 8*4+6*4(r1) + stw r10, 8*4+7*4(r1) + lwbrx r8, 0, r4 + lwbrx r12, r4, r13 + lwbrx r5, r4, r14 + lwbrx r6, r4, r15 + addi r4, r4, 4*4 + stw r8, 8*4+8*4(r1) + stw r12, 8*4+9*4(r1) + stw r5, 8*4+10*4(r1) + stw r6, 8*4+11*4(r1) + lwbrx r7, 0, r4 + lwbrx r8, r4, r13 + lwbrx r9, r4, r14 + lwbrx r10, r4, r15 + mr r4, r12 + stw r7, 8*4+12*4(r1) + stw r8, 8*4+13*4(r1) + stw r9, 8*4+14*4(r1) + stw r10, 8*4+15*4(r1) + +sha256_transform_extend: + sha256_extend_doubleround 0, r1, 8*4, r4, r5, r9, r10 + sha256_extend_doubleround 2, r1, 8*4, r6, r7, r4, r5 + sha256_extend_doubleround 4, r1, 8*4, r8, r9, r6, r7 + sha256_extend_doubleround 6, r1, 8*4, r10, r4, r8, r9 + sha256_extend_doubleround 8, r1, 8*4, r5, r6, r10, r4 + sha256_extend_doubleround 10, r1, 8*4, r7, r8, r5, r6 + sha256_extend_doubleround 12, r1, 8*4, r9, r10, r7, r8 + sha256_extend_doubleround 14, r1, 8*4, r4, r5, r9, r10 + sha256_extend_doubleround 16, r1, 8*4, r6, r7, r4, r5 + 
sha256_extend_doubleround 18, r1, 8*4, r8, r9, r6, r7 + sha256_extend_doubleround 20, r1, 8*4, r10, r4, r8, r9 + sha256_extend_doubleround 22, r1, 8*4, r5, r6, r10, r4 + sha256_extend_doubleround 24, r1, 8*4, r7, r8, r5, r6 + sha256_extend_doubleround 26, r1, 8*4, r9, r10, r7, r8 + sha256_extend_doubleround 28, r1, 8*4, r4, r5, r9, r10 + sha256_extend_doubleround 30, r1, 8*4, r6, r7, r4, r5 + sha256_extend_doubleround 32, r1, 8*4, r8, r9, r6, r7 + sha256_extend_doubleround 34, r1, 8*4, r10, r4, r8, r9 + sha256_extend_doubleround 36, r1, 8*4, r5, r6, r10, r4 + sha256_extend_doubleround 38, r1, 8*4, r7, r8, r5, r6 + sha256_extend_doubleround 40, r1, 8*4, r9, r10, r7, r8 + sha256_extend_doubleround 42, r1, 8*4, r4, r5, r9, r10 + sha256_extend_doubleround 44, r1, 8*4, r6, r7, r4, r5 + sha256_extend_doubleround 46, r1, 8*4, r8, r9, r6, r7 + + lwz r4, 0*4(r3) + lwz r5, 1*4(r3) + lwz r6, 2*4(r3) + lwz r7, 3*4(r3) + lwz r8, 4*4(r3) + lwz r9, 5*4(r3) + lwz r10, 6*4(r3) + lwz r11, 7*4(r3) +#ifdef _AIX + ld r16, T.sha256_k(r2) +#else + lis r16, HI(sha256_k) + addi r16, r16, LO(sha256_k) +#endif + sha256_main_quadround 0, r16, r1, 8*4 + sha256_main_quadround 4, r16, r1, 8*4 + sha256_main_quadround 8, r16, r1, 8*4 + sha256_main_quadround 12, r16, r1, 8*4 + sha256_main_quadround 16, r16, r1, 8*4 + sha256_main_quadround 20, r16, r1, 8*4 + sha256_main_quadround 24, r16, r1, 8*4 + sha256_main_quadround 28, r16, r1, 8*4 + sha256_main_quadround 32, r16, r1, 8*4 + sha256_main_quadround 36, r16, r1, 8*4 + sha256_main_quadround 40, r16, r1, 8*4 + sha256_main_quadround 44, r16, r1, 8*4 + sha256_main_quadround 48, r16, r1, 8*4 + sha256_main_quadround 52, r16, r1, 8*4 + sha256_main_quadround 56, r16, r1, 8*4 + sha256_main_quadround 60, r16, r1, 8*4 + + lwz r12, 0*4(r3) + lwz r13, 1*4(r3) + lwz r14, 2*4(r3) + lwz r15, 3*4(r3) + add r4, r4, r12 + add r5, r5, r13 + add r6, r6, r14 + add r7, r7, r15 + stw r4, 0*4(r3) + stw r5, 1*4(r3) + stw r6, 2*4(r3) + stw r7, 3*4(r3) + lwz r12, 4*4(r3) + lwz r13, 5*4(r3) + lwz r14, 6*4(r3) + lwz r15, 7*4(r3) + add r8, r8, r12 + add r9, r9, r13 + add r10, r10, r14 + add r11, r11, r15 + stw r8, 4*4(r3) + stw r9, 5*4(r3) + stw r10, 6*4(r3) + stw r11, 7*4(r3) + + ld r13, 2*4(r1) + ld r14, 4*4(r1) + ld r15, 6*4(r1) + ld r16, 72*4(r1) + addi r1, r1, 76*4 + blr + + + .align 2 + .globl sha256d_ms + .globl _sha256d_ms + .globl .sha256d_ms +#ifdef __ELF__ + .type sha256d_ms, %function +#endif +sha256d_ms: +_sha256d_ms: +.sha256d_ms: + stdu r1, -80*4(r1) + std r13, 2*4(r1) + std r14, 4*4(r1) + std r15, 6*4(r1) + std r16, 72*4(r1) + std r17, 74*4(r1) + std r18, 76*4(r1) + + mr r17, r4 + mr r18, r5 + mr r16, r6 + + lwz r14, 3*4(r17) + lwz r6, 18*4(r17) + lwz r7, 19*4(r17) + + rotrwi r12, r14, 7 + rotrwi r13, r14, 18 + stw r6, 8*4+18*4(r1) + xor r12, r12, r13 + srwi r13, r14, 3 + stw r7, 8*4+19*4(r1) + xor r12, r12, r13 + lwz r8, 20*4(r17) + add r6, r6, r12 + lwz r10, 22*4(r17) + add r7, r7, r14 + stw r6, 18*4(r17) + + rotrwi r12, r6, 17 + rotrwi r13, r6, 19 + stw r7, 19*4(r17) + xor r12, r12, r13 + srwi r13, r6, 10 + stw r8, 8*4+20*4(r1) + xor r12, r12, r13 + lwz r4, 23*4(r17) + add r8, r8, r12 + lwz r5, 24*4(r17) + + rotrwi r9, r7, 17 + rotrwi r13, r7, 19 + stw r8, 20*4(r17) + xor r9, r9, r13 + srwi r13, r7, 10 + stw r10, 8*4+21*4(r1) + xor r9, r9, r13 + stw r4, 8*4+22*4(r1) + + rotrwi r12, r8, 17 + rotrwi r13, r8, 19 + stw r9, 21*4(r17) + xor r12, r12, r13 + srwi r13, r8, 10 + stw r5, 8*4+23*4(r1) + xor r12, r12, r13 + rotrwi r14, r9, 17 + rotrwi r13, r9, 19 + add r10, r10, r12 + lwz r11, 
30*4(r17) + + xor r14, r14, r13 + srwi r13, r9, 10 + stw r10, 22*4(r17) + xor r14, r14, r13 + stw r11, 8*4+24*4(r1) + add r4, r4, r14 + + rotrwi r12, r10, 17 + rotrwi r13, r10, 19 + stw r4, 23*4(r17) + xor r12, r12, r13 + srwi r13, r10, 10 + rotrwi r14, r4, 17 + xor r12, r12, r13 + rotrwi r13, r4, 19 + xor r14, r14, r13 + srwi r13, r4, 10 + add r5, r5, r12 + xor r14, r14, r13 + stw r5, 24*4(r17) + add r6, r6, r14 + + rotrwi r12, r5, 17 + rotrwi r13, r5, 19 + stw r6, 25*4(r17) + xor r12, r12, r13 + srwi r13, r5, 10 + rotrwi r14, r6, 17 + xor r12, r12, r13 + rotrwi r13, r6, 19 + xor r14, r14, r13 + srwi r13, r6, 10 + add r7, r7, r12 + xor r14, r14, r13 + stw r7, 26*4(r17) + add r8, r8, r14 + + rotrwi r12, r7, 17 + rotrwi r13, r7, 19 + stw r8, 27*4(r17) + xor r12, r12, r13 + srwi r13, r7, 10 + rotrwi r14, r8, 17 + xor r12, r12, r13 + rotrwi r13, r8, 19 + xor r14, r14, r13 + srwi r13, r8, 10 + add r9, r9, r12 + xor r14, r14, r13 + stw r9, 28*4(r17) + add r10, r10, r14 + + lwz r14, 31*4(r17) + rotrwi r12, r9, 17 + rotrwi r13, r9, 19 + stw r10, 29*4(r17) + xor r12, r12, r13 + srwi r13, r9, 10 + stw r14, 8*4+25*4(r1) + xor r12, r12, r13 + add r11, r11, r12 + add r5, r5, r14 + rotrwi r12, r10, 17 + rotrwi r13, r10, 19 + add r4, r4, r11 + + lwz r11, 16*4(r17) + xor r12, r12, r13 + srwi r13, r10, 10 + stw r4, 30*4(r17) + xor r12, r12, r13 + add r5, r5, r12 + stw r5, 31*4(r17) + + sha256_extend_doubleround 16, r17, 0, r6, r7, r4, r5 + sha256_extend_doubleround 18, r17, 0, r8, r9, r6, r7 + sha256_extend_doubleround 20, r17, 0, r10, r4, r8, r9 + sha256_extend_doubleround 22, r17, 0, r5, r6, r10, r4 + sha256_extend_doubleround 24, r17, 0, r7, r8, r5, r6 + sha256_extend_doubleround 26, r17, 0, r9, r10, r7, r8 + sha256_extend_doubleround 28, r17, 0, r4, r5, r9, r10 + sha256_extend_doubleround 30, r17, 0, r6, r7, r4, r5 + sha256_extend_doubleround 32, r17, 0, r8, r9, r6, r7 + sha256_extend_doubleround 34, r17, 0, r10, r4, r8, r9 + sha256_extend_doubleround 36, r17, 0, r5, r6, r10, r4 + sha256_extend_doubleround 38, r17, 0, r7, r8, r5, r6 + sha256_extend_doubleround 40, r17, 0, r9, r10, r7, r8 + sha256_extend_doubleround 42, r17, 0, r4, r5, r9, r10 + sha256_extend_doubleround 44, r17, 0, r6, r7, r4, r5 + sha256_extend_doubleround 46, r17, 0, r8, r9, r6, r7 + + lwz r4, 0*4(r16) + lwz r9, 1*4(r16) + lwz r10, 2*4(r16) + lwz r11, 3*4(r16) + lwz r8, 4*4(r16) + lwz r5, 5*4(r16) + lwz r6, 6*4(r16) + lwz r7, 7*4(r16) +#ifdef _AIX + ld r16, T.sha256_k(r2) +#else + lis r16, HI(sha256_k) + addi r16, r16, LO(sha256_k) +#endif + + sha256_main_round 3, r16, r17, 0, r5, r6, r7, r4, r9, r10, r11, r8 + sha256_main_quadround 4, r16, r17, 0 + sha256_main_quadround 8, r16, r17, 0 + sha256_main_quadround 12, r16, r17, 0 + sha256_main_quadround 16, r16, r17, 0 + sha256_main_quadround 20, r16, r17, 0 + sha256_main_quadround 24, r16, r17, 0 + sha256_main_quadround 28, r16, r17, 0 + sha256_main_quadround 32, r16, r17, 0 + sha256_main_quadround 36, r16, r17, 0 + sha256_main_quadround 40, r16, r17, 0 + sha256_main_quadround 44, r16, r17, 0 + sha256_main_quadround 48, r16, r17, 0 + sha256_main_quadround 52, r16, r17, 0 + sha256_main_quadround 56, r16, r17, 0 + sha256_main_quadround 60, r16, r17, 0 + + lwz r12, 0*4(r18) + lwz r13, 1*4(r18) + lwz r14, 2*4(r18) + lwz r15, 3*4(r18) + add r4, r4, r12 + add r5, r5, r13 + add r6, r6, r14 + add r7, r7, r15 + stw r4, 8*4+0*4(r1) + stw r5, 8*4+1*4(r1) + stw r6, 8*4+2*4(r1) + stw r7, 8*4+3*4(r1) + lwz r12, 4*4(r18) + lwz r13, 5*4(r18) + lwz r14, 6*4(r18) + lwz r15, 7*4(r18) + add r8, r8, r12 + 
add r9, r9, r13 + add r10, r10, r14 + add r11, r11, r15 + stw r8, 8*4+4*4(r1) + stw r9, 8*4+5*4(r1) + stw r10, 8*4+6*4(r1) + stw r11, 8*4+7*4(r1) + + lwz r4, 8*4+18*4(r1) + lwz r5, 8*4+19*4(r1) + lwz r6, 8*4+20*4(r1) + lwz r7, 8*4+21*4(r1) + lwz r8, 8*4+22*4(r1) + lwz r9, 8*4+23*4(r1) + lwz r10, 8*4+24*4(r1) + lwz r11, 8*4+25*4(r1) + stw r4, 18*4(r17) + stw r5, 19*4(r17) + stw r6, 20*4(r17) + stw r7, 22*4(r17) + stw r8, 23*4(r17) + stw r9, 24*4(r17) + stw r10, 30*4(r17) + stw r11, 31*4(r17) + + lis r8, 0x8000 + li r9, 0 + li r10, 0x0100 + + lwz r14, 8*4+1*4(r1) + lwz r4, 8*4+0*4(r1) + + lwz r11, 8*4+2*4(r1) + rotrwi r12, r14, 7 + rotrwi r13, r14, 18 + + stw r8, 8*4+8*4(r1) + stw r9, 8*4+9*4(r1) + stw r9, 8*4+10*4(r1) + stw r9, 8*4+11*4(r1) + stw r9, 8*4+12*4(r1) + stw r9, 8*4+13*4(r1) + stw r9, 8*4+14*4(r1) + stw r10, 8*4+15*4(r1) + + xor r12, r12, r13 + srwi r13, r14, 3 + addis r5, r14, 0x00a0 + xor r12, r12, r13 + rotrwi r14, r11, 7 + rotrwi r13, r11, 18 + add r4, r4, r12 + xor r14, r14, r13 + srwi r13, r11, 3 + stw r4, 8*4+16*4(r1) + xor r14, r14, r13 + rotrwi r12, r4, 17 + rotrwi r13, r4, 19 + add r5, r5, r14 + lwz r14, 8*4+3*4(r1) + + stw r5, 8*4+17*4(r1) + xor r12, r12, r13 + srwi r13, r4, 10 + rotrwi r6, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + xor r6, r6, r13 + srwi r13, r14, 3 + add r11, r11, r12 + xor r6, r6, r13 + rotrwi r12, r5, 17 + rotrwi r13, r5, 19 + add r6, r6, r11 + lwz r11, 8*4+4*4(r1) + + stw r6, 8*4+18*4(r1) + xor r12, r12, r13 + srwi r13, r5, 10 + rotrwi r7, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + xor r7, r7, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r7, r7, r13 + rotrwi r12, r6, 17 + rotrwi r13, r6, 19 + add r7, r7, r14 + lwz r14, 8*4+5*4(r1) + + stw r7, 8*4+19*4(r1) + xor r12, r12, r13 + srwi r13, r6, 10 + rotrwi r8, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + xor r8, r8, r13 + srwi r13, r14, 3 + add r11, r11, r12 + xor r8, r8, r13 + rotrwi r12, r7, 17 + rotrwi r13, r7, 19 + add r8, r8, r11 + lwz r11, 8*4+6*4(r1) + + stw r8, 8*4+20*4(r1) + xor r12, r12, r13 + srwi r13, r7, 10 + rotrwi r9, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + xor r9, r9, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r9, r9, r13 + rotrwi r12, r8, 17 + rotrwi r13, r8, 19 + add r9, r9, r14 + lwz r14, 8*4+7*4(r1) + + stw r9, 8*4+21*4(r1) + xor r12, r12, r13 + srwi r13, r8, 10 + rotrwi r10, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + xor r10, r10, r13 + srwi r13, r14, 3 + add r11, r11, r12 + xor r10, r10, r13 + rotrwi r12, r9, 17 + rotrwi r13, r9, 19 + addi r11, r11, 0x0100 + add r14, r14, r4 + add r10, r10, r11 + + xor r12, r12, r13 + srwi r13, r9, 10 + stw r10, 8*4+22*4(r1) + addis r14, r14, 0x1100 + xor r12, r12, r13 + add r14, r14, r12 + rotrwi r12, r10, 17 + rotrwi r13, r10, 19 + addi r4, r14, 0x2000 + xor r12, r12, r13 + srwi r13, r10, 10 + stw r4, 8*4+23*4(r1) + addis r5, r5, 0x8000 + xor r12, r12, r13 + add r5, r5, r12 + + rotrwi r12, r4, 17 + rotrwi r13, r4, 19 + stw r5, 8*4+24*4(r1) + xor r12, r12, r13 + srwi r13, r4, 10 + rotrwi r11, r5, 17 + xor r12, r12, r13 + rotrwi r13, r5, 19 + xor r11, r11, r13 + srwi r13, r5, 10 + add r6, r6, r12 + xor r11, r11, r13 + stw r6, 8*4+25*4(r1) + add r7, r7, r11 + + rotrwi r12, r6, 17 + rotrwi r13, r6, 19 + stw r7, 8*4+26*4(r1) + xor r12, r12, r13 + srwi r13, r6, 10 + rotrwi r11, r7, 17 + xor r12, r12, r13 + rotrwi r13, r7, 19 + xor r11, r11, r13 + srwi r13, r7, 10 + add r8, r8, r12 + xor r11, r11, r13 + stw r8, 8*4+27*4(r1) + add r9, r9, r11 + + rotrwi r14, r8, 17 + rotrwi r13, r8, 19 + rotrwi r12, 
r9, 17 + stw r9, 8*4+28*4(r1) + addis r4, r4, 0x0040 + xor r14, r14, r13 + rotrwi r13, r9, 19 + xor r12, r12, r13 + srwi r13, r8, 10 + xor r14, r14, r13 + srwi r13, r9, 10 + xor r12, r12, r13 + addi r4, r4, 0x0022 + add r10, r10, r14 + add r4, r4, r12 + lwz r11, 8*4+16*4(r1) + + addi r5, r5, 0x0100 + stw r4, 8*4+30*4(r1) + rotrwi r14, r11, 7 + stw r10, 8*4+29*4(r1) + rotrwi r13, r11, 18 + rotrwi r12, r10, 17 + xor r14, r14, r13 + rotrwi r13, r10, 19 + xor r12, r12, r13 + srwi r13, r11, 3 + xor r14, r14, r13 + srwi r13, r10, 10 + xor r12, r12, r13 + add r5, r5, r14 + add r5, r5, r12 + stw r5, 8*4+31*4(r1) + + sha256_extend_doubleround 16, r1, 8*4, r6, r7, r4, r5 + sha256_extend_doubleround 18, r1, 8*4, r8, r9, r6, r7 + sha256_extend_doubleround 20, r1, 8*4, r10, r4, r8, r9 + sha256_extend_doubleround 22, r1, 8*4, r5, r6, r10, r4 + sha256_extend_doubleround 24, r1, 8*4, r7, r8, r5, r6 + sha256_extend_doubleround 26, r1, 8*4, r9, r10, r7, r8 + sha256_extend_doubleround 28, r1, 8*4, r4, r5, r9, r10 + sha256_extend_doubleround 30, r1, 8*4, r6, r7, r4, r5 + sha256_extend_doubleround 32, r1, 8*4, r8, r9, r6, r7 + sha256_extend_doubleround 34, r1, 8*4, r10, r4, r8, r9 + sha256_extend_doubleround 36, r1, 8*4, r5, r6, r10, r4 + sha256_extend_doubleround 38, r1, 8*4, r7, r8, r5, r6 + sha256_extend_doubleround 40, r1, 8*4, r9, r10, r7, r8 + sha256_extend_doubleround 42, r1, 8*4, r4, r5, r9, r10 + +#ifdef _AIX + ld r18, T.sha256_h(r2) +#else + lis r18, HI(sha256_h) + addi r18, r18, LO(sha256_h) +#endif + + lwz r14, 8*4+(44+1)*4(r1) + rotrwi r12, r4, 17 + rotrwi r13, r4, 19 + add r15, r11, r6 + rotrwi r6, r14, 7 + rotrwi r11, r14, 18 + xor r12, r12, r13 + xor r6, r6, r11 + + lwz r8, 4*4(r18) + lwz r9, 5*4(r18) + lwz r10, 6*4(r18) + lwz r11, 7*4(r18) + + srwi r13, r4, 10 + srwi r14, r14, 3 + xor r12, r12, r13 + xor r6, r6, r14 + add r12, r12, r15 + add r6, r6, r12 + stw r6, 8*4+(44+16)*4(r1) + + lwz r4, 0*4(r18) + lwz r5, 1*4(r18) + lwz r6, 2*4(r18) + lwz r7, 3*4(r18) + + sha256_main_quadround 0, r16, r1, 8*4 + sha256_main_quadround 4, r16, r1, 8*4 + sha256_main_quadround 8, r16, r1, 8*4 + sha256_main_quadround 12, r16, r1, 8*4 + sha256_main_quadround 16, r16, r1, 8*4 + sha256_main_quadround 20, r16, r1, 8*4 + sha256_main_quadround 24, r16, r1, 8*4 + sha256_main_quadround 28, r16, r1, 8*4 + sha256_main_quadround 32, r16, r1, 8*4 + sha256_main_quadround 36, r16, r1, 8*4 + sha256_main_quadround 40, r16, r1, 8*4 + sha256_main_quadround 44, r16, r1, 8*4 + sha256_main_quadround 48, r16, r1, 8*4 + sha256_main_quadround 52, r16, r1, 8*4 + sha256_main_round 56, r16, r1, 8*4, r4, r5, r6, r7, r8, r9, r10, r11 + +.macro sha256_main_round_red i, rk, rw, wo, rd, re, rf, rg, rh + lwz r12, \wo+(\i)*4(\rw) + and r15, \rf, \re + andc r14, \rg, \re + add \rh, \rh, \rd + or r14, r14, r15 + lwz r15, (\i)*4(\rk) + rotrwi r13, \re, 5 + add \rh, \rh, r14 + xor r14, \re, r13 + rotrwi r13, \re, 19 + add \rh, \rh, r12 + xor r14, r14, r13 + add \rh, \rh, r15 + rotrwi r13, r14, 6 + add \rh, \rh, r13 +.endm + + sha256_main_round_red 57, r16, r1, 8*4, r6, r11, r8, r9, r10 + sha256_main_round_red 58, r16, r1, 8*4, r5, r10, r11, r8, r9 + sha256_main_round_red 59, r16, r1, 8*4, r4, r9, r10, r11, r8 + lwz r5, 7*4(r18) + sha256_main_round_red 60, r16, r1, 8*4, r7, r8, r9, r10, r11 + + add r11, r11, r5 + stw r11, 7*4(r3) + + ld r13, 2*4(r1) + ld r14, 4*4(r1) + ld r15, 6*4(r1) + ld r16, 72*4(r1) + ld r17, 74*4(r1) + ld r18, 76*4(r1) + addi r1, r1, 80*4 + blr + + +#ifdef __ALTIVEC__ + +#ifdef __APPLE__ + .machine ppc7400 +#endif + +#ifdef 
_AIX + .csect .text[RO] +#else + .data +#endif + .align 4 +sha256_4h: + .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 + .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 + .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372 + .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a + .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f + .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c + .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab + .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 + + .align 4 +sha256_4k: + .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98 + .long 0x71374491, 0x71374491, 0x71374491, 0x71374491 + .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf + .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5 + .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b + .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1 + .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4 + .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5 + .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98 + .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01 + .long 0x243185be, 0x243185be, 0x243185be, 0x243185be + .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3 + .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74 + .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe + .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7 + .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174 + .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1 + .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786 + .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6 + .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc + .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f + .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa + .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc + .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da + .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152 + .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d + .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8 + .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7 + .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3 + .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147 + .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351 + .long 0x14292967, 0x14292967, 0x14292967, 0x14292967 + .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85 + .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138 + .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc + .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13 + .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354 + .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb + .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e + .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85 + .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1 + .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b + .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70 + .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3 + .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819 + .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624 + .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585 + .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070 + .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116 + .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08 + .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c + .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5 + .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3 + .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a + .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 
0x5b9cca4f + .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3 + .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee + .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f + .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814 + .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208 + .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa + .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb + .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7 + .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2 + + .align 4 +sha256d_4preext2: + .long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000 + .long 0x11002000, 0x11002000, 0x11002000, 0x11002000 + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 + .long 0x00400022, 0x00400022, 0x00400022, 0x00400022 + + .align 4 +br_perm: + .long 0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c + +#ifdef _AIX + .toc +T.sha256_4h: + .tc sha256_4h[TC], sha256_4h +T.sha256_4k: + .tc sha256_4k[TC], sha256_4k +T.sha256d_4preext2: + .tc sha256d_4preext2[TC], sha256d_4preext2 +T.br_perm: + .tc br_perm[TC], br_perm +#endif + + +.macro sha256_4way_extend_setup + vspltisw v0, 10 + vspltisw v1, -7 + vspltisw v16, 3 + vspltisw v17, 15 + vspltisw v18, 14 + vspltisw v19, 13 +.endm + +.macro sha256_4way_extend_doubleround i, rw, va, vb, vy, vz + lvx v14, \rw, r7 + vrlw v12, \vy, v17 + vrlw v13, \vy, v19 + vadduwm v11, v11, \va + vxor v12, v12, v13 + vsrw v13, \vy, v0 + vrlw \va, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor \va, \va, v13 + vsrw v13, v14, v16 + lvx v11, \rw, r8 + vxor \va, \va, v13 + vrlw v13, \vz, v19 + vadduwm \va, \va, v12 + + vrlw v12, \vz, v17 + vadduwm v14, v14, \vb + vxor v12, v12, v13 + vsrw v13, \vz, v0 + vrlw \vb, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx \va, \rw, r10 + vxor \vb, \vb, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor \vb, \vb, v13 + vadduwm \vb, \vb, v14 + stvx \vb, \rw, r11 + addi \rw, \rw, 2*16 +.endm + + +.macro sha256_4way_main_setup + vspltisw v2, 12 + vspltisw v3, -5 + vspltisw v16, -6 + vspltisw v17, -11 + vspltisw v18, -2 +.endm + +.macro sha256_4way_main_round i, rk, rw, va, vb, vc, vd, ve, vf, vg, vh + li r6, (\i)*16 + lvx v12, \rw, r6 + vand v13, \vf, \ve + vandc v14, \vg, \ve + lvx v15, \rk, r6 + vor v14, v14, v13 + vrlw v13, \ve, v3 + vadduwm \vh, \vh, v14 + vxor v14, \ve, v13 + vrlw v13, \ve, v19 + vadduwm \vh, \vh, v12 + vxor v14, v14, v13 + vadduwm \vh, \vh, v15 + vrlw v13, v14, v16 + vxor v15, \va, \vb + vadduwm \vh, \vh, v13 + + vrlw v13, \va, v17 + vand v15, v15, \vc + vxor v12, \va, v13 + vrlw v13, \va, v2 + vand v14, \va, \vb + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, \vh, v14 + vadduwm \vh, \vh, \vd + vadduwm \vd, v15, v13 +.endm + +.macro sha256_4way_main_quadround i, rk, rw + sha256_4way_main_round \i+0, \rk, \rw, v4, v5, v6, v7, v8, v9, v10, v11 + sha256_4way_main_round \i+1, \rk, \rw, v7, v4, v5, v6, v11, v8, v9, v10 + sha256_4way_main_round \i+2, \rk, \rw, v6, v7, v4, v5, v10, v11, v8, v9 + sha256_4way_main_round \i+3, \rk, \rw, v5, v6, v7, v4, v9, v10, v11, v8 +.endm + + +#ifdef _AIX + .csect .text[PR] +#else + .text +#endif + .align 2 + .globl sha256_init_4way + .globl _sha256_init_4way + .globl .sha256_init_4way +#ifdef __ELF__ + .type sha256_init_4way, %function +#endif +sha256_init_4way: +_sha256_init_4way: +.sha256_init_4way: + mfspr r0, 256 + oris r12, r0, 0xff00 + mtspr 256, r12 + +#ifdef _AIX + ld r4, T.sha256_4h(r2) +#else + lis r4, HI(sha256_4h) + addi r4, r4, LO(sha256_4h) +#endif + li r5, 1*16 + li r6, 
2*16 + li r7, 3*16 + li r8, 4*16 + li r9, 5*16 + li r10, 6*16 + li r11, 7*16 + lvx v0, 0, r4 + lvx v1, r4, r5 + lvx v2, r4, r6 + lvx v3, r4, r7 + lvx v4, r4, r8 + lvx v5, r4, r9 + lvx v6, r4, r10 + lvx v7, r4, r11 + stvx v0, 0, r3 + stvx v1, r3, r5 + stvx v2, r3, r6 + stvx v3, r3, r7 + stvx v4, r3, r8 + stvx v5, r3, r9 + stvx v6, r3, r10 + stvx v7, r3, r11 + + mtspr 256, r0 + blr + + + .align 2 + .globl sha256_transform_4way + .globl _sha256_transform_4way + .globl .sha256_transform_4way +#ifdef __ELF__ + .type sha256_transform_4way, %function +#endif +sha256_transform_4way: +_sha256_transform_4way: +.sha256_transform_4way: + mfspr r0, 256 + oris r12, r0, 0xffff + ori r12, r12, 0xf000 + mtspr 256, r12 + + andi. r6, r1, 15 + cmpwi 0, r5, 0 + li r7, -(4*4+64*16) + subf r6, r6, r7 + stdux r1, r1, r6 + + li r7, 1*16 + li r8, 2*16 + li r9, 3*16 + li r10, 4*16 + li r11, 5*16 + li r12, 6*16 + li r6, 7*16 + + bne 0, sha256_transform_4way_swap + + lvx v11, 0, r4 + lvx v1, r4, r7 + lvx v2, r4, r8 + lvx v3, r4, r9 + lvx v4, r4, r10 + lvx v5, r4, r11 + lvx v6, r4, r12 + lvx v7, r4, r6 + addi r5, r1, 4*4 + stvx v11, 0, r5 + stvx v1, r5, r7 + stvx v2, r5, r8 + stvx v3, r5, r9 + stvx v4, r5, r10 + stvx v5, r5, r11 + stvx v6, r5, r12 + stvx v7, r5, r6 + addi r4, r4, 8*16 + lvx v0, 0, r4 + lvx v4, r4, r7 + lvx v5, r4, r8 + lvx v6, r4, r9 + lvx v7, r4, r10 + lvx v8, r4, r11 + lvx v9, r4, r12 + lvx v10, r4, r6 + addi r4, r1, 4*4+8*16 + stvx v0, 0, r4 + stvx v4, r4, r7 + stvx v5, r4, r8 + stvx v6, r4, r9 + stvx v7, r4, r10 + stvx v8, r4, r11 + stvx v9, r4, r12 + stvx v10, r4, r6 + b sha256_transform_4way_extend + +sha256_transform_4way_swap: +#ifdef _AIX + ld r5, T.br_perm(r2) +#else + lis r5, HI(br_perm) + addi r5, r5, LO(br_perm) +#endif + lvx v19, 0, r5 + + lvx v11, 0, r4 + lvx v1, r4, r7 + lvx v2, r4, r8 + lvx v3, r4, r9 + lvx v4, r4, r10 + lvx v5, r4, r11 + lvx v6, r4, r12 + lvx v7, r4, r6 + vperm v11, v11, v11, v19 + vperm v1, v1, v1, v19 + vperm v2, v2, v2, v19 + vperm v3, v3, v3, v19 + vperm v4, v4, v4, v19 + vperm v5, v5, v5, v19 + vperm v6, v6, v6, v19 + vperm v7, v7, v7, v19 + addi r5, r1, 4*4 + stvx v11, 0, r5 + stvx v1, r5, r7 + stvx v2, r5, r8 + stvx v3, r5, r9 + stvx v4, r5, r10 + stvx v5, r5, r11 + stvx v6, r5, r12 + stvx v7, r5, r6 + addi r4, r4, 8*16 + lvx v0, 0, r4 + lvx v4, r4, r7 + lvx v5, r4, r8 + lvx v6, r4, r9 + lvx v7, r4, r10 + lvx v8, r4, r11 + lvx v9, r4, r12 + lvx v10, r4, r6 + vperm v0, v0, v0, v19 + vperm v4, v4, v4, v19 + vperm v5, v5, v5, v19 + vperm v6, v6, v6, v19 + vperm v7, v7, v7, v19 + vperm v8, v8, v8, v19 + vperm v9, v9, v9, v19 + vperm v10, v10, v10, v19 + addi r4, r1, 4*4+8*16 + stvx v0, 0, r4 + stvx v4, r4, r7 + stvx v5, r4, r8 + stvx v6, r4, r9 + stvx v7, r4, r10 + stvx v8, r4, r11 + stvx v9, r4, r12 + stvx v10, r4, r6 + +sha256_transform_4way_extend: + li r10, 16*16 + li r11, 17*16 + sha256_4way_extend_setup + sha256_4way_extend_doubleround 0, r5, v4, v5, v9, v10 + sha256_4way_extend_doubleround 2, r5, v6, v7, v4, v5 + sha256_4way_extend_doubleround 4, r5, v8, v9, v6, v7 + sha256_4way_extend_doubleround 6, r5, v10, v4, v8, v9 + sha256_4way_extend_doubleround 8, r5, v5, v6, v10, v4 + sha256_4way_extend_doubleround 10, r5, v7, v8, v5, v6 + sha256_4way_extend_doubleround 12, r5, v9, v10, v7, v8 + sha256_4way_extend_doubleround 14, r5, v4, v5, v9, v10 + sha256_4way_extend_doubleround 16, r5, v6, v7, v4, v5 + sha256_4way_extend_doubleround 18, r5, v8, v9, v6, v7 + sha256_4way_extend_doubleround 20, r5, v10, v4, v8, v9 + sha256_4way_extend_doubleround 22, r5, v5, v6, 
v10, v4 + sha256_4way_extend_doubleround 24, r5, v7, v8, v5, v6 + sha256_4way_extend_doubleround 26, r5, v9, v10, v7, v8 + sha256_4way_extend_doubleround 28, r5, v4, v5, v9, v10 + sha256_4way_extend_doubleround 30, r5, v6, v7, v4, v5 + sha256_4way_extend_doubleround 32, r5, v8, v9, v6, v7 + sha256_4way_extend_doubleround 34, r5, v10, v4, v8, v9 + sha256_4way_extend_doubleround 36, r5, v5, v6, v10, v4 + sha256_4way_extend_doubleround 38, r5, v7, v8, v5, v6 + sha256_4way_extend_doubleround 40, r5, v9, v10, v7, v8 + sha256_4way_extend_doubleround 42, r5, v4, v5, v9, v10 + sha256_4way_extend_doubleround 44, r5, v6, v7, v4, v5 + sha256_4way_extend_doubleround 46, r5, v8, v9, v6, v7 + + addi r11, r3, 4*16 + lvx v4, 0, r3 + lvx v5, r3, r7 + lvx v6, r3, r8 + lvx v7, r3, r9 + lvx v8, 0, r11 + lvx v9, r11, r7 + lvx v10, r11, r8 + lvx v11, r11, r9 +#ifdef _AIX + ld r12, T.sha256_4k(r2) +#else + lis r12, HI(sha256_4k) + addi r12, r12, LO(sha256_4k) +#endif + addi r5, r1, 4*4 + sha256_4way_main_setup + sha256_4way_main_quadround 0, r12, r5 + sha256_4way_main_quadround 4, r12, r5 + sha256_4way_main_quadround 8, r12, r5 + sha256_4way_main_quadround 12, r12, r5 + sha256_4way_main_quadround 16, r12, r5 + sha256_4way_main_quadround 20, r12, r5 + sha256_4way_main_quadround 24, r12, r5 + sha256_4way_main_quadround 28, r12, r5 + sha256_4way_main_quadround 32, r12, r5 + sha256_4way_main_quadround 36, r12, r5 + sha256_4way_main_quadround 40, r12, r5 + sha256_4way_main_quadround 44, r12, r5 + sha256_4way_main_quadround 48, r12, r5 + sha256_4way_main_quadround 52, r12, r5 + sha256_4way_main_quadround 56, r12, r5 + sha256_4way_main_quadround 60, r12, r5 + + lvx v12, 0, r3 + lvx v13, r3, r7 + lvx v14, r3, r8 + lvx v15, r3, r9 + lvx v16, 0, r11 + lvx v17, r11, r7 + lvx v18, r11, r8 + lvx v19, r11, r9 + vadduwm v4, v4, v12 + vadduwm v5, v5, v13 + vadduwm v6, v6, v14 + vadduwm v7, v7, v15 + vadduwm v8, v8, v16 + vadduwm v9, v9, v17 + vadduwm v10, v10, v18 + vadduwm v11, v11, v19 + stvx v4, 0, r3 + stvx v5, r3, r7 + stvx v6, r3, r8 + stvx v7, r3, r9 + stvx v8, 0, r11 + stvx v9, r11, r7 + stvx v10, r11, r8 + stvx v11, r11, r9 + + ld r1, 0(r1) + mtspr 256, r0 + blr + + + .align 2 + .globl sha256d_ms_4way + .globl _sha256d_ms_4way + .globl .sha256d_ms_4way +#ifdef __ELF__ + .type sha256d_ms_4way, %function +#endif +sha256d_ms_4way: +_sha256d_ms_4way: +.sha256d_ms_4way: + mfspr r0, 256 + oris r12, r0, 0xffff + ori r12, r12, 0xf000 + mtspr 256, r12 + + andi. 
r12, r1, 15 + li r11, -(4*4+64*16) + subf r12, r12, r11 + stdux r1, r1, r12 + + li r7, 1*16 + li r8, 2*16 + li r9, 3*16 + li r10, 16*16 + li r11, 17*16 + + sha256_4way_extend_setup + + addi r4, r4, 2*16 + addi r12, r1, 4*4+18*16 + lvx v14, r4, r7 + lvx v6, r4, r10 + lvx v7, r4, r11 + + vrlw v12, v14, v1 + vrlw v13, v14, v18 + stvx v6, 0, r12 + vxor v12, v12, v13 + vsrw v13, v14, v16 + stvx v7, r12, r7 + vxor v12, v12, v13 + vadduwm v6, v6, v12 + vadduwm v7, v7, v14 + stvx v6, r4, r10 + + vrlw v12, v6, v17 + vrlw v13, v6, v19 + stvx v7, r4, r11 + addi r4, r4, 18*16 + lvx v8, 0, r4 + vxor v12, v12, v13 + vsrw v13, v6, v0 + stvx v8, r12, r8 + vxor v12, v12, v13 + vadduwm v8, v8, v12 + + vrlw v9, v7, v17 + vrlw v13, v7, v19 + stvx v8, 0, r4 + vxor v9, v9, v13 + vsrw v13, v7, v0 + vxor v9, v9, v13 + + vrlw v12, v8, v17 + vrlw v13, v8, v19 + stvx v9, r4, r7 + vxor v12, v12, v13 + vsrw v13, v8, v0 + lvx v10, r4, r8 + lvx v4, r4, r9 + vxor v12, v12, v13 + stvx v10, r12, r9 + addi r12, r12, 4*16 + stvx v4, 0, r12 + vrlw v14, v9, v17 + vrlw v13, v9, v19 + vadduwm v10, v10, v12 + + vxor v14, v14, v13 + vsrw v13, v9, v0 + stvx v10, r4, r8 + vxor v14, v14, v13 + vadduwm v4, v4, v14 + + vrlw v12, v10, v17 + vrlw v13, v10, v19 + stvx v4, r4, r9 + vxor v12, v12, v13 + vsrw v13, v10, v0 + vrlw v14, v4, v17 + vxor v12, v12, v13 + vrlw v13, v4, v19 + addi r4, r4, 4*16 + lvx v5, 0, r4 + vxor v14, v14, v13 + stvx v5, r12, r7 + vsrw v13, v4, v0 + vadduwm v5, v5, v12 + vxor v14, v14, v13 + stvx v5, 0, r4 + vadduwm v6, v6, v14 + + vrlw v12, v5, v17 + vrlw v13, v5, v19 + stvx v6, r4, r7 + vxor v12, v12, v13 + vsrw v13, v5, v0 + vrlw v14, v6, v17 + vxor v12, v12, v13 + vrlw v13, v6, v19 + vxor v14, v14, v13 + vsrw v13, v6, v0 + vadduwm v7, v7, v12 + vxor v14, v14, v13 + stvx v7, r4, r8 + vadduwm v8, v8, v14 + + vrlw v12, v7, v17 + vrlw v13, v7, v19 + stvx v8, r4, r9 + vxor v12, v12, v13 + vsrw v13, v7, v0 + vrlw v14, v8, v17 + vxor v12, v12, v13 + vrlw v13, v8, v19 + vxor v14, v14, v13 + vsrw v13, v8, v0 + vadduwm v9, v9, v12 + vxor v14, v14, v13 + addi r4, r4, 4*16 + stvx v9, 0, r4 + vadduwm v10, v10, v14 + + vrlw v12, v9, v17 + vrlw v13, v9, v19 + stvx v10, r4, r7 + vxor v12, v12, v13 + vsrw v13, v9, v0 + lvx v11, r4, r8 + lvx v14, r4, r9 + stvx v11, r12, r8 + stvx v14, r12, r9 + vxor v12, v12, v13 + vadduwm v11, v11, v12 + vadduwm v5, v5, v14 + vrlw v12, v10, v17 + vrlw v13, v10, v19 + vadduwm v4, v4, v11 + + vxor v12, v12, v13 + vsrw v13, v10, v0 + stvx v4, r4, r8 + vxor v12, v12, v13 + vadduwm v5, v5, v12 + stvx v5, r4, r9 + addi r4, r4, -12*16 + lvx v11, 0, r4 + + sha256_4way_extend_doubleround 16, r4, v6, v7, v4, v5 + sha256_4way_extend_doubleround 18, r4, v8, v9, v6, v7 + sha256_4way_extend_doubleround 20, r4, v10, v4, v8, v9 + sha256_4way_extend_doubleround 22, r4, v5, v6, v10, v4 + sha256_4way_extend_doubleround 24, r4, v7, v8, v5, v6 + sha256_4way_extend_doubleround 26, r4, v9, v10, v7, v8 + sha256_4way_extend_doubleround 28, r4, v4, v5, v9, v10 + sha256_4way_extend_doubleround 30, r4, v6, v7, v4, v5 + sha256_4way_extend_doubleround 32, r4, v8, v9, v6, v7 + sha256_4way_extend_doubleround 34, r4, v10, v4, v8, v9 + sha256_4way_extend_doubleround 36, r4, v5, v6, v10, v4 + sha256_4way_extend_doubleround 38, r4, v7, v8, v5, v6 + sha256_4way_extend_doubleround 40, r4, v9, v10, v7, v8 + sha256_4way_extend_doubleround 42, r4, v4, v5, v9, v10 + sha256_4way_extend_doubleround 44, r4, v6, v7, v4, v5 + sha256_4way_extend_doubleround 46, r4, v8, v9, v6, v7 + addi r4, r4, -48*16 + + lvx v4, 0, r6 + lvx v9, r6, r7 + 
lvx v10, r6, r8 + lvx v11, r6, r9 + addi r12, r6, 4*16 + lvx v8, 0, r12 + lvx v5, r12, r7 + lvx v6, r12, r8 + lvx v7, r12, r9 +#ifdef _AIX + ld r12, T.sha256_4k(r2) +#else + lis r12, HI(sha256_4k) + addi r12, r12, LO(sha256_4k) +#endif + sha256_4way_main_setup + sha256_4way_main_round 3, r12, r4, v5, v6, v7, v4, v9, v10, v11, v8 + sha256_4way_main_quadround 4, r12, r4 + sha256_4way_main_quadround 8, r12, r4 + sha256_4way_main_quadround 12, r12, r4 + sha256_4way_main_quadround 16, r12, r4 + sha256_4way_main_quadround 20, r12, r4 + sha256_4way_main_quadround 24, r12, r4 + sha256_4way_main_quadround 28, r12, r4 + sha256_4way_main_quadround 32, r12, r4 + sha256_4way_main_quadround 36, r12, r4 + sha256_4way_main_quadround 40, r12, r4 + sha256_4way_main_quadround 44, r12, r4 + sha256_4way_main_quadround 48, r12, r4 + sha256_4way_main_quadround 52, r12, r4 + sha256_4way_main_quadround 56, r12, r4 + sha256_4way_main_quadround 60, r12, r4 + + lvx v12, 0, r5 + lvx v13, r5, r7 + lvx v14, r5, r8 + lvx v15, r5, r9 + addi r12, r5, 4*16 + lvx v16, 0, r12 + lvx v17, r12, r7 + lvx v18, r12, r8 + lvx v19, r12, r9 + vadduwm v4, v4, v12 + vadduwm v5, v5, v13 + vadduwm v6, v6, v14 + vadduwm v7, v7, v15 + vadduwm v8, v8, v16 + vadduwm v9, v9, v17 + vadduwm v10, v10, v18 + vadduwm v11, v11, v19 + addi r12, r1, 4*4 + stvx v4, 0, r12 + stvx v5, r12, r7 + stvx v6, r12, r8 + stvx v7, r12, r9 + addi r12, r12, 4*16 + stvx v8, 0, r12 + stvx v9, r12, r7 + stvx v10, r12, r8 + stvx v11, r12, r9 + + addi r12, r1, 4*4+18*16 + lvx v4, 0, r12 + lvx v5, r12, r7 + lvx v6, r12, r8 + lvx v7, r12, r9 + addi r12, r12, 4*16 + lvx v8, 0, r12 + lvx v9, r12, r7 + lvx v10, r12, r8 + lvx v11, r12, r9 + addi r12, r4, 18*16 + stvx v4, 0, r12 + stvx v5, r12, r7 + stvx v6, r12, r8 + addi r12, r4, 22*16 + stvx v7, 0, r12 + stvx v8, r12, r7 + stvx v9, r12, r8 + addi r12, r4, 30*16 + stvx v10, 0, r12 + stvx v11, r12, r7 + + addi r4, r1, 4*4 + + sha256_4way_extend_setup + +#ifdef _AIX + ld r12, T.sha256d_4preext2(r2) +#else + lis r12, HI(sha256d_4preext2) + addi r12, r12, LO(sha256d_4preext2) +#endif + lvx v2, 0, r12 + + vxor v9, v9, v9 + vspltisw v3, 1 + lvx v4, r12, r8 + vsldoi v3, v3, v3, 1 + addi r5, r1, 4*4+8*16 + stvx v4, 0, r5 + stvx v9, r5, r7 + stvx v9, r5, r8 + stvx v9, r5, r9 + addi r5, r5, 4*16 + stvx v9, 0, r5 + stvx v9, r5, r7 + stvx v9, r5, r8 + stvx v3, r5, r9 + + lvx v4, 0, r4 + lvx v14, r4, r7 + + lvx v11, r4, r8 + vrlw v12, v14, v1 + vrlw v13, v14, v18 + + vxor v12, v12, v13 + vsrw v13, v14, v16 + vadduwm v5, v14, v2 + vxor v12, v12, v13 + vrlw v14, v11, v1 + vrlw v13, v11, v18 + vadduwm v4, v4, v12 + vxor v14, v14, v13 + vsrw v13, v11, v16 + stvx v4, r4, r10 + vxor v14, v14, v13 + vrlw v12, v4, v17 + vrlw v13, v4, v19 + vadduwm v5, v5, v14 + + stvx v5, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vxor v12, v12, v13 + vsrw v13, v4, v0 + vrlw v6, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vxor v6, v6, v13 + vsrw v13, v14, v16 + vadduwm v11, v11, v12 + vxor v6, v6, v13 + vrlw v12, v5, v17 + vrlw v13, v5, v19 + vadduwm v6, v6, v11 + lvx v11, r4, r8 + + stvx v6, r4, r10 + vxor v12, v12, v13 + vsrw v13, v5, v0 + vrlw v7, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + vxor v7, v7, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v7, v7, v13 + vrlw v12, v6, v17 + vrlw v13, v6, v19 + vadduwm v7, v7, v14 + + stvx v7, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vxor v12, v12, v13 + vsrw v13, v6, v0 + vrlw v8, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vxor v8, v8, v13 + vsrw v13, v14, v16 + vadduwm 
v11, v11, v12 + vxor v8, v8, v13 + vrlw v12, v7, v17 + vrlw v13, v7, v19 + vadduwm v8, v8, v11 + lvx v11, r4, r8 + + stvx v8, r4, r10 + vxor v12, v12, v13 + vsrw v13, v7, v0 + vrlw v9, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + vxor v9, v9, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor v9, v9, v13 + vrlw v12, v8, v17 + vrlw v13, v8, v19 + vadduwm v9, v9, v14 + + stvx v9, r4, r11 + addi r4, r4, 2*16 + lvx v14, r4, r7 + vxor v12, v12, v13 + vsrw v13, v8, v0 + vrlw v10, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vxor v10, v10, v13 + vsrw v13, v14, v16 + vadduwm v11, v11, v12 + vxor v10, v10, v13 + vrlw v12, v9, v17 + vrlw v13, v9, v19 + vadduwm v11, v11, v3 + vadduwm v14, v14, v4 + vadduwm v10, v10, v11 + + lvx v2, r12, r7 + vxor v12, v12, v13 + vsrw v13, v9, v0 + stvx v10, r4, r10 + vxor v12, v12, v13 + vadduwm v14, v14, v12 + vrlw v12, v10, v17 + vrlw v13, v10, v19 + vadduwm v4, v14, v2 + lvx v2, r12, r8 + vxor v12, v12, v13 + vsrw v13, v10, v0 + stvx v4, r4, r11 + vadduwm v5, v5, v2 + vxor v12, v12, v13 + vadduwm v5, v5, v12 + + vrlw v12, v4, v17 + vrlw v13, v4, v19 + addi r4, r4, 2*16 + stvx v5, r4, r10 + vxor v12, v12, v13 + vsrw v13, v4, v0 + vrlw v11, v5, v17 + vxor v12, v12, v13 + vrlw v13, v5, v19 + vxor v11, v11, v13 + vsrw v13, v5, v0 + vadduwm v6, v6, v12 + vxor v11, v11, v13 + stvx v6, r4, r11 + vadduwm v7, v7, v11 + + vrlw v12, v6, v17 + vrlw v13, v6, v19 + addi r4, r4, 2*16 + stvx v7, r4, r10 + vxor v12, v12, v13 + vsrw v13, v6, v0 + vrlw v11, v7, v17 + vxor v12, v12, v13 + vrlw v13, v7, v19 + vxor v11, v11, v13 + vsrw v13, v7, v0 + vadduwm v8, v8, v12 + vxor v11, v11, v13 + stvx v8, r4, r11 + vadduwm v9, v9, v11 + + lvx v2, r12, r9 + vrlw v14, v8, v17 + vrlw v13, v8, v19 + vrlw v12, v9, v17 + addi r4, r4, 2*16 + stvx v9, r4, r10 + vxor v14, v14, v13 + vrlw v13, v9, v19 + vxor v12, v12, v13 + vsrw v13, v8, v0 + vxor v14, v14, v13 + vsrw v13, v9, v0 + vxor v12, v12, v13 + vadduwm v4, v4, v2 + vadduwm v10, v10, v14 + vadduwm v4, v4, v12 + stvx v10, r4, r11 + addi r4, r4, 2*16 + lvx v11, r4, r8 + + vadduwm v5, v5, v3 + stvx v4, r4, r10 + vrlw v14, v11, v1 + vrlw v13, v11, v18 + vrlw v12, v10, v17 + vxor v14, v14, v13 + vrlw v13, v10, v19 + vxor v12, v12, v13 + vsrw v13, v11, v16 + vxor v14, v14, v13 + vsrw v13, v10, v0 + vxor v12, v12, v13 + vadduwm v5, v5, v14 + vadduwm v5, v5, v12 + stvx v5, r4, r11 + addi r4, r4, 2*16 + + sha256_4way_extend_doubleround 16, r4, v6, v7, v4, v5 + sha256_4way_extend_doubleround 18, r4, v8, v9, v6, v7 + sha256_4way_extend_doubleround 20, r4, v10, v4, v8, v9 + sha256_4way_extend_doubleround 22, r4, v5, v6, v10, v4 + sha256_4way_extend_doubleround 24, r4, v7, v8, v5, v6 + sha256_4way_extend_doubleround 26, r4, v9, v10, v7, v8 + sha256_4way_extend_doubleround 28, r4, v4, v5, v9, v10 + sha256_4way_extend_doubleround 30, r4, v6, v7, v4, v5 + sha256_4way_extend_doubleround 32, r4, v8, v9, v6, v7 + sha256_4way_extend_doubleround 34, r4, v10, v4, v8, v9 + sha256_4way_extend_doubleround 36, r4, v5, v6, v10, v4 + sha256_4way_extend_doubleround 38, r4, v7, v8, v5, v6 + sha256_4way_extend_doubleround 40, r4, v9, v10, v7, v8 + sha256_4way_extend_doubleround 42, r4, v4, v5, v9, v10 + + lvx v14, r4, r7 + vrlw v12, v4, v17 + vrlw v13, v4, v19 + vadduwm v15, v11, v6 + vrlw v6, v14, v1 + vrlw v11, v14, v18 + vxor v12, v12, v13 + vxor v6, v6, v11 + vsrw v13, v4, v0 + vsrw v14, v14, v16 + vxor v12, v12, v13 + vxor v6, v6, v14 + vadduwm v12, v12, v15 + vadduwm v6, v6, v12 + stvx v6, r4, r10 + addi r4, r4, -44*16 + +#ifdef _AIX + ld r5, 
T.sha256_4h(r2) +#else + lis r5, HI(sha256_4h) + addi r5, r5, LO(sha256_4h) +#endif + lvx v4, 0, r5 + lvx v5, r5, r7 + lvx v6, r5, r8 + lvx v7, r5, r9 + addi r12, r5, 4*16 + lvx v8, 0, r12 + lvx v9, r12, r7 + lvx v10, r12, r8 + lvx v11, r12, r9 +#ifdef _AIX + ld r12, T.sha256_4k(r2) +#else + lis r12, HI(sha256_4k) + addi r12, r12, LO(sha256_4k) +#endif + sha256_4way_main_setup + sha256_4way_main_quadround 0, r12, r4 + sha256_4way_main_quadround 4, r12, r4 + sha256_4way_main_quadround 8, r12, r4 + sha256_4way_main_quadround 12, r12, r4 + sha256_4way_main_quadround 16, r12, r4 + sha256_4way_main_quadround 20, r12, r4 + sha256_4way_main_quadround 24, r12, r4 + sha256_4way_main_quadround 28, r12, r4 + sha256_4way_main_quadround 32, r12, r4 + sha256_4way_main_quadround 36, r12, r4 + sha256_4way_main_quadround 40, r12, r4 + sha256_4way_main_quadround 44, r12, r4 + sha256_4way_main_quadround 48, r12, r4 + sha256_4way_main_quadround 52, r12, r4 + sha256_4way_main_round 56, r12, r4, v4, v5, v6, v7, v8, v9, v10, v11 + +.macro sha256_4way_main_round_red i, rk, rw, vd, ve, vf, vg, vh + li r6, (\i)*16 + vand v15, \vf, \ve + vandc v14, \vg, \ve + lvx v12, \rw, r6 + vadduwm \vh, \vh, \vd + vor v14, v14, v15 + lvx v15, \rk, r6 + vrlw v13, \ve, v3 + vadduwm \vh, \vh, v14 + vxor v14, \ve, v13 + vrlw v13, \ve, v19 + vadduwm \vh, \vh, v12 + vxor v14, v14, v13 + vadduwm \vh, \vh, v15 + vrlw v13, v14, v16 + vadduwm \vh, \vh, v13 +.endm + + sha256_4way_main_round_red 57, r12, r4, v6, v11, v8, v9, v10 + sha256_4way_main_round_red 58, r12, r4, v5, v10, v11, v8, v9 + sha256_4way_main_round_red 59, r12, r4, v4, v9, v10, v11, v8 + sha256_4way_main_round_red 60, r12, r4, v7, v8, v9, v10, v11 + + li r12, 7*16 + lvx v19, r5, r12 + vadduwm v11, v11, v19 + stvx v11, r3, r12 + + ld r1, 0(r1) + mtspr 256, r0 + blr + + + .align 2 + .globl sha256_use_4way + .globl _sha256_use_4way + .globl .sha256_use_4way +#ifdef __ELF__ + .type sha256_use_4way, %function +#endif +sha256_use_4way: +_sha256_use_4way: +.sha256_use_4way: + li r3, 1 + blr + +#endif /* __ALTIVEC__ */ + +#endif diff --git a/sha2-x64.S b/sha2-x64.S index 770d3ba29..7db31e2bf 100644 --- a/sha2-x64.S +++ b/sha2-x64.S @@ -45,201 +45,7 @@ bswap_xmm_mask: .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f -.macro sha256_mixed_quadround ra, rb, rc, rd, re, rf, rg, rh, x0, x1, x2, x3 - movdqa \x3, %xmm4 - movl \re, %eax - movdqa \x2, %xmm6 - rorl $(25-11), %eax - movl \ra, %ebx - pslldq $12, %xmm4 - rorl $(22-13), %ebx - psrldq $4, %xmm6 - xorl \re, %eax - movl \rf, %ecx - rorl $(11-6), %eax - pxor %xmm6, %xmm4 - movdqa \x1, %xmm5 - xorl \ra, %ebx - xorl \rg, %ecx - xorl \re, %eax - paddd \x0, %xmm4 - movdqa \x0, %xmm7 - andl \re, %ecx - rorl $(13-2), %ebx - xorl \ra, %ebx - pslldq $12, %xmm5 - psrldq $4, %xmm7 - rorl $6, %eax - xorl \rg, %ecx - pxor %xmm7, %xmm5 - rorl $2, %ebx - addl %eax, %ecx - addl (%rsp) , %ecx - movdqa %xmm5, %xmm6 - movl \ra, %eax - addl %ecx, \rh - movl \ra, %ecx - movdqa %xmm5, %xmm7 - orl \rc, %eax - addl \rh, \rd - andl \rc, %ecx - pslld $(32-7), %xmm5 - psrld $7, %xmm6 - andl \rb, %eax - addl %ebx, \rh - orl %ecx, %eax - por %xmm6, %xmm5 - addl %eax, \rh - - movl \rd, %eax - movdqa %xmm7, %xmm6 - movl \rh, %ebx - rorl $(25-11), %eax - xorl \rd, %eax - movdqa %xmm7, %xmm8 - movl \re, %ecx - rorl $(22-13), %ebx - xorl \rh, %ebx - pslld $(32-18), %xmm7 - rorl $(11-6), %eax - xorl \rf, %ecx - rorl $(13-2), %ebx - psrld $18, %xmm6 - xorl \rd, %eax - andl \rd, %ecx - rorl $6, %eax - pxor %xmm7, %xmm5 - xorl \rh, %ebx - xorl \rf, %ecx - psrld $3, 
%xmm8 - addl %eax, %ecx - addl 1*4(%rsp), %ecx - rorl $2, %ebx - pxor %xmm6, %xmm5 - movl \rh, %eax - addl %ecx, \rg - movl \rh, %ecx - pxor %xmm8, %xmm5 - orl \rb, %eax - addl \rg, \rc - andl \rb, %ecx - pshufd $0xfa, \x3, %xmm6 - andl \ra, %eax - addl %ebx, \rg - paddd %xmm5, %xmm4 - orl %ecx, %eax - addl %eax, \rg - - movl \rc, %eax - movdqa %xmm6, %xmm7 - movl \rg, %ebx - rorl $(25-11), %eax - xorl \rc, %eax - movdqa %xmm6, %xmm8 - rorl $(22-13), %ebx - movl \rd, %ecx - xorl \rg, %ebx - psrlq $17, %xmm6 - psrlq $19, %xmm7 - rorl $(11-6), %eax - xorl \re, %ecx - xorl \rc, %eax - psrld $10, %xmm8 - pxor %xmm7, %xmm6 - andl \rc, %ecx - rorl $(13-2), %ebx - xorl \rg, %ebx - pxor %xmm6, %xmm8 - xorl \re, %ecx - rorl $6, %eax - addl %eax, %ecx - pshufd $0x8f, %xmm8, %xmm8 - rorl $2, %ebx - addl 2*4(%rsp), %ecx - movl \rg, %eax - psrldq $8, %xmm8 - addl %ecx, \rf - movl \rg, %ecx - orl \ra, %eax - paddd %xmm8, %xmm4 - addl \rf, \rb - andl \ra, %ecx - andl \rh, %eax - pshufd $0x50, %xmm4, %xmm6 - addl %ebx, \rf - orl %ecx, %eax - addl %eax, \rf - - movdqa %xmm6, %xmm7 - movl \rb, %eax - rorl $(25-11), %eax - movl \rf, %ebx - movdqa %xmm6, \x0 - rorl $(22-13), %ebx - xorl \rb, %eax - movl \rc, %ecx - psrlq $17, %xmm6 - rorl $(11-6), %eax - xorl \rf, %ebx - xorl \rd, %ecx - psrlq $19, %xmm7 - xorl \rb, %eax - andl \rb, %ecx - rorl $(13-2), %ebx - psrld $10, \x0 - xorl \rf, %ebx - rorl $6, %eax - pxor %xmm7, %xmm6 - xorl \rd, %ecx - rorl $2, %ebx - addl %eax, %ecx - pxor %xmm6, \x0 - addl 3*4(%rsp), %ecx - movl \rf, %eax - addl %ecx, \re - pshufd $0xf8, \x0, \x0 - movl \rf, %ecx - orl \rh, %eax - addl \re, \ra - pslldq $8, \x0 - andl \rh, %ecx - andl \rg, %eax - paddd %xmm4, \x0 - addl %ebx, \re - orl %ecx, %eax - addl %eax, \re -.endm -.macro sha256_main_round i, ra, rb, rc, rd, re, rf, rg, rh - movl \re, %eax - rorl $(25-11), %eax - movl \ra, %ebx - xorl \re, %eax - rorl $(22-13), %ebx - movl \rf, %ecx - xorl \ra, %ebx - rorl $(11-6), %eax - xorl \rg, %ecx - xorl \re, %eax - rorl $(13-2), %ebx - andl \re, %ecx - xorl \ra, %ebx - rorl $6, %eax - xorl \rg, %ecx - addl %eax, %ecx - rorl $2, %ebx - addl \i*4(%rsp), %ecx - movl \ra, %eax - addl %ecx, \rh - movl \ra, %ecx - orl \rc, %eax - addl \rh, \rd - andl \rc, %ecx - andl \rb, %eax - addl %ebx, \rh - orl %ecx, %eax - addl %eax, \rh -.endm .text @@ -321,183 +127,1156 @@ sha256_transform_sse2_loop: movdqa 0*16(%rdx), %xmm9 paddd %xmm0, %xmm9 movdqa %xmm9, (%rsp) - sha256_mixed_quadround %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %xmm0, %xmm1, %xmm2, %xmm3 + movdqa %xmm3, %xmm4 + movl %r12d, %eax + movdqa %xmm2, %xmm6 + rorl $(25-11), %eax + movl %r8d, %ebx + pslldq $12, %xmm4 + rorl $(22-13), %ebx + psrldq $4, %xmm6 + xorl %r12d, %eax + movl %r13d, %ecx + rorl $(11-6), %eax + pxor %xmm6, %xmm4 + movdqa %xmm1, %xmm5 + xorl %r8d, %ebx + xorl %r14d, %ecx + xorl %r12d, %eax + paddd %xmm0, %xmm4 + movdqa %xmm0, %xmm7 + andl %r12d, %ecx + rorl $(13-2), %ebx + xorl %r8d, %ebx + pslldq $12, %xmm5 + psrldq $4, %xmm7 + rorl $6, %eax + xorl %r14d, %ecx + pxor %xmm7, %xmm5 + rorl $2, %ebx + addl %eax, %ecx + addl (%rsp) , %ecx + movdqa %xmm5, %xmm6 + movl %r8d, %eax + addl %ecx, %r15d + movl %r8d, %ecx + movdqa %xmm5, %xmm7 + orl %r10d, %eax + addl %r15d, %r11d + andl %r10d, %ecx + pslld $(32-7), %xmm5 + psrld $7, %xmm6 + andl %r9d, %eax + addl %ebx, %r15d + orl %ecx, %eax + por %xmm6, %xmm5 + addl %eax, %r15d + + movl %r11d, %eax + movdqa %xmm7, %xmm6 + movl %r15d, %ebx + rorl $(25-11), %eax + xorl %r11d, %eax + movdqa %xmm7, %xmm8 + movl %r12d, 
%ecx + rorl $(22-13), %ebx + xorl %r15d, %ebx + pslld $(32-18), %xmm7 + rorl $(11-6), %eax + xorl %r13d, %ecx + rorl $(13-2), %ebx + psrld $18, %xmm6 + xorl %r11d, %eax + andl %r11d, %ecx + rorl $6, %eax + pxor %xmm7, %xmm5 + xorl %r15d, %ebx + xorl %r13d, %ecx + psrld $3, %xmm8 + addl %eax, %ecx + addl 1*4(%rsp), %ecx + rorl $2, %ebx + pxor %xmm6, %xmm5 + movl %r15d, %eax + addl %ecx, %r14d + movl %r15d, %ecx + pxor %xmm8, %xmm5 + orl %r9d, %eax + addl %r14d, %r10d + andl %r9d, %ecx + pshufd $0xfa, %xmm3, %xmm6 + andl %r8d, %eax + addl %ebx, %r14d + paddd %xmm5, %xmm4 + orl %ecx, %eax + addl %eax, %r14d + + movl %r10d, %eax + movdqa %xmm6, %xmm7 + movl %r14d, %ebx + rorl $(25-11), %eax + xorl %r10d, %eax + movdqa %xmm6, %xmm8 + rorl $(22-13), %ebx + movl %r11d, %ecx + xorl %r14d, %ebx + psrlq $17, %xmm6 + psrlq $19, %xmm7 + rorl $(11-6), %eax + xorl %r12d, %ecx + xorl %r10d, %eax + psrld $10, %xmm8 + pxor %xmm7, %xmm6 + andl %r10d, %ecx + rorl $(13-2), %ebx + xorl %r14d, %ebx + pxor %xmm6, %xmm8 + xorl %r12d, %ecx + rorl $6, %eax + addl %eax, %ecx + pshufd $0x8f, %xmm8, %xmm8 + rorl $2, %ebx + addl 2*4(%rsp), %ecx + movl %r14d, %eax + psrldq $8, %xmm8 + addl %ecx, %r13d + movl %r14d, %ecx + orl %r8d, %eax + paddd %xmm8, %xmm4 + addl %r13d, %r9d + andl %r8d, %ecx + andl %r15d, %eax + pshufd $0x50, %xmm4, %xmm6 + addl %ebx, %r13d + orl %ecx, %eax + addl %eax, %r13d + + movdqa %xmm6, %xmm7 + movl %r9d, %eax + rorl $(25-11), %eax + movl %r13d, %ebx + movdqa %xmm6, %xmm0 + rorl $(22-13), %ebx + xorl %r9d, %eax + movl %r10d, %ecx + psrlq $17, %xmm6 + rorl $(11-6), %eax + xorl %r13d, %ebx + xorl %r11d, %ecx + psrlq $19, %xmm7 + xorl %r9d, %eax + andl %r9d, %ecx + rorl $(13-2), %ebx + psrld $10, %xmm0 + xorl %r13d, %ebx + rorl $6, %eax + pxor %xmm7, %xmm6 + xorl %r11d, %ecx + rorl $2, %ebx + addl %eax, %ecx + pxor %xmm6, %xmm0 + addl 3*4(%rsp), %ecx + movl %r13d, %eax + addl %ecx, %r12d + pshufd $0xf8, %xmm0, %xmm0 + movl %r13d, %ecx + orl %r15d, %eax + addl %r12d, %r8d + pslldq $8, %xmm0 + andl %r15d, %ecx + andl %r14d, %eax + paddd %xmm4, %xmm0 + addl %ebx, %r12d + orl %ecx, %eax + addl %eax, %r12d movdqa 1*16(%rdx), %xmm9 paddd %xmm1, %xmm9 movdqa %xmm9, (%rsp) - sha256_mixed_quadround %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %xmm1, %xmm2, %xmm3, %xmm0 - movdqa 2*16(%rdx), %xmm9 - paddd %xmm2, %xmm9 - movdqa %xmm9, (%rsp) - sha256_mixed_quadround %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %xmm2, %xmm3, %xmm0, %xmm1 - movdqa 3*16(%rdx), %xmm9 - paddd %xmm3, %xmm9 - movdqa %xmm9, (%rsp) - addq $4*16, %rdx - sha256_mixed_quadround %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %xmm3, %xmm0, %xmm1, %xmm2 - - subq $16, %rsi - jne sha256_transform_sse2_loop + movdqa %xmm0, %xmm4 + movl %r8d, %eax + movdqa %xmm3, %xmm6 + rorl $(25-11), %eax + movl %r12d, %ebx + pslldq $12, %xmm4 + rorl $(22-13), %ebx + psrldq $4, %xmm6 + xorl %r8d, %eax + movl %r9d, %ecx + rorl $(11-6), %eax + pxor %xmm6, %xmm4 + movdqa %xmm2, %xmm5 + xorl %r12d, %ebx + xorl %r10d, %ecx + xorl %r8d, %eax + paddd %xmm1, %xmm4 + movdqa %xmm1, %xmm7 + andl %r8d, %ecx + rorl $(13-2), %ebx + xorl %r12d, %ebx + pslldq $12, %xmm5 + psrldq $4, %xmm7 + rorl $6, %eax + xorl %r10d, %ecx + pxor %xmm7, %xmm5 + rorl $2, %ebx + addl %eax, %ecx + addl (%rsp) , %ecx + movdqa %xmm5, %xmm6 + movl %r12d, %eax + addl %ecx, %r11d + movl %r12d, %ecx + movdqa %xmm5, %xmm7 + orl %r14d, %eax + addl %r11d, %r15d + andl %r14d, %ecx + pslld $(32-7), %xmm5 + psrld $7, %xmm6 + andl %r13d, %eax + addl %ebx, %r11d + orl %ecx, %eax + 
por %xmm6, %xmm5 + addl %eax, %r11d - paddd 0*16(%rdx), %xmm0 - movdqa %xmm0, (%rsp) - sha256_main_round 0, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d - sha256_main_round 1, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d - sha256_main_round 2, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d - sha256_main_round 3, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d - paddd 1*16(%rdx), %xmm1 - movdqa %xmm1, (%rsp) - sha256_main_round 0, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d - sha256_main_round 1, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d - sha256_main_round 2, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d - sha256_main_round 3, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d - paddd 2*16(%rdx), %xmm2 - movdqa %xmm2, (%rsp) - sha256_main_round 0, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d - sha256_main_round 1, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d - sha256_main_round 2, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d - sha256_main_round 3, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d - paddd 3*16(%rdx), %xmm3 - movdqa %xmm3, (%rsp) - sha256_main_round 0, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d - sha256_main_round 1, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d - sha256_main_round 2, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d - sha256_main_round 3, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d + movl %r15d, %eax + movdqa %xmm7, %xmm6 + movl %r11d, %ebx + rorl $(25-11), %eax + xorl %r15d, %eax + movdqa %xmm7, %xmm8 + movl %r8d, %ecx + rorl $(22-13), %ebx + xorl %r11d, %ebx + pslld $(32-18), %xmm7 + rorl $(11-6), %eax + xorl %r9d, %ecx + rorl $(13-2), %ebx + psrld $18, %xmm6 + xorl %r15d, %eax + andl %r15d, %ecx + rorl $6, %eax + pxor %xmm7, %xmm5 + xorl %r11d, %ebx + xorl %r9d, %ecx + psrld $3, %xmm8 + addl %eax, %ecx + addl 1*4(%rsp), %ecx + rorl $2, %ebx + pxor %xmm6, %xmm5 + movl %r11d, %eax + addl %ecx, %r10d + movl %r11d, %ecx + pxor %xmm8, %xmm5 + orl %r13d, %eax + addl %r10d, %r14d + andl %r13d, %ecx + pshufd $0xfa, %xmm0, %xmm6 + andl %r12d, %eax + addl %ebx, %r10d + paddd %xmm5, %xmm4 + orl %ecx, %eax + addl %eax, %r10d - addl %r8d, 0*4(%rdi) - addl %r9d, 1*4(%rdi) - addl %r10d, 2*4(%rdi) - addl %r11d, 3*4(%rdi) - addl %r12d, 4*4(%rdi) - addl %r13d, 5*4(%rdi) - addl %r14d, 6*4(%rdi) - addl %r15d, 7*4(%rdi) + movl %r14d, %eax + movdqa %xmm6, %xmm7 + movl %r10d, %ebx + rorl $(25-11), %eax + xorl %r14d, %eax + movdqa %xmm6, %xmm8 + rorl $(22-13), %ebx + movl %r15d, %ecx + xorl %r10d, %ebx + psrlq $17, %xmm6 + psrlq $19, %xmm7 + rorl $(11-6), %eax + xorl %r8d, %ecx + xorl %r14d, %eax + psrld $10, %xmm8 + pxor %xmm7, %xmm6 + andl %r14d, %ecx + rorl $(13-2), %ebx + xorl %r10d, %ebx + pxor %xmm6, %xmm8 + xorl %r8d, %ecx + rorl $6, %eax + addl %eax, %ecx + pshufd $0x8f, %xmm8, %xmm8 + rorl $2, %ebx + addl 2*4(%rsp), %ecx + movl %r10d, %eax + psrldq $8, %xmm8 + addl %ecx, %r9d + movl %r10d, %ecx + orl %r12d, %eax + paddd %xmm8, %xmm4 + addl %r9d, %r13d + andl %r12d, %ecx + andl %r11d, %eax + pshufd $0x50, %xmm4, %xmm6 + addl %ebx, %r9d + orl %ecx, %eax + addl %eax, %r9d -#if defined(_WIN64) || defined(__CYGWIN__) - movdqa 1*16(%rsp), %xmm6 - movdqa 2*16(%rsp), %xmm7 - movdqa 3*16(%rsp), %xmm8 - movdqa 4*16(%rsp), %xmm9 - addq $5*16, %rsp - popq %rsi - popq %rdi -#else - addq $16, %rsp -#endif - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - ret - - - .text - .p2align 6 -sha256_transform_phe: -#if defined(_WIN64) || defined(__CYGWIN__) - pushq %rdi 
- pushq %rsi - movq %rcx, %rdi - movq %rdx, %rsi - movq %r8, %rdx -#endif - movq %rsp, %r8 - subq $64, %rsp - andq $-64, %rsp - - testq %rdx, %rdx - jnz sha256_transform_phe_noswap - - movl 0*4(%rsi), %eax - movl 1*4(%rsi), %ecx - movl 2*4(%rsi), %edx - movl 3*4(%rsi), %r9d - bswapl %eax - bswapl %ecx - bswapl %edx - bswapl %r9d - movl %eax, 0*4(%rsp) - movl %ecx, 1*4(%rsp) - movl %edx, 2*4(%rsp) - movl %r9d, 3*4(%rsp) - movl 4*4(%rsi), %eax - movl 5*4(%rsi), %ecx - movl 6*4(%rsi), %edx - movl 7*4(%rsi), %r9d - bswapl %eax - bswapl %ecx - bswapl %edx - bswapl %r9d - movl %eax, 4*4(%rsp) - movl %ecx, 5*4(%rsp) - movl %edx, 6*4(%rsp) - movl %r9d, 7*4(%rsp) + movdqa %xmm6, %xmm7 + movl %r13d, %eax + rorl $(25-11), %eax + movl %r9d, %ebx + movdqa %xmm6, %xmm1 + rorl $(22-13), %ebx + xorl %r13d, %eax + movl %r14d, %ecx + psrlq $17, %xmm6 + rorl $(11-6), %eax + xorl %r9d, %ebx + xorl %r15d, %ecx + psrlq $19, %xmm7 + xorl %r13d, %eax + andl %r13d, %ecx + rorl $(13-2), %ebx + psrld $10, %xmm1 + xorl %r9d, %ebx + rorl $6, %eax + pxor %xmm7, %xmm6 + xorl %r15d, %ecx + rorl $2, %ebx + addl %eax, %ecx + pxor %xmm6, %xmm1 + addl 3*4(%rsp), %ecx + movl %r9d, %eax + addl %ecx, %r8d + pshufd $0xf8, %xmm1, %xmm1 + movl %r9d, %ecx + orl %r11d, %eax + addl %r8d, %r12d + pslldq $8, %xmm1 + andl %r11d, %ecx + andl %r10d, %eax + paddd %xmm4, %xmm1 + addl %ebx, %r8d + orl %ecx, %eax + addl %eax, %r8d + movdqa 2*16(%rdx), %xmm9 + paddd %xmm2, %xmm9 + movdqa %xmm9, (%rsp) + movdqa %xmm1, %xmm4 + movl %r12d, %eax + movdqa %xmm0, %xmm6 + rorl $(25-11), %eax + movl %r8d, %ebx + pslldq $12, %xmm4 + rorl $(22-13), %ebx + psrldq $4, %xmm6 + xorl %r12d, %eax + movl %r13d, %ecx + rorl $(11-6), %eax + pxor %xmm6, %xmm4 + movdqa %xmm3, %xmm5 + xorl %r8d, %ebx + xorl %r14d, %ecx + xorl %r12d, %eax + paddd %xmm2, %xmm4 + movdqa %xmm2, %xmm7 + andl %r12d, %ecx + rorl $(13-2), %ebx + xorl %r8d, %ebx + pslldq $12, %xmm5 + psrldq $4, %xmm7 + rorl $6, %eax + xorl %r14d, %ecx + pxor %xmm7, %xmm5 + rorl $2, %ebx + addl %eax, %ecx + addl (%rsp) , %ecx + movdqa %xmm5, %xmm6 + movl %r8d, %eax + addl %ecx, %r15d + movl %r8d, %ecx + movdqa %xmm5, %xmm7 + orl %r10d, %eax + addl %r15d, %r11d + andl %r10d, %ecx + pslld $(32-7), %xmm5 + psrld $7, %xmm6 + andl %r9d, %eax + addl %ebx, %r15d + orl %ecx, %eax + por %xmm6, %xmm5 + addl %eax, %r15d - movdqu 2*16(%rsi), %xmm0 - movdqu 3*16(%rsi), %xmm2 - pshuflw $0xb1, %xmm0, %xmm0 - pshuflw $0xb1, %xmm2, %xmm2 - pshufhw $0xb1, %xmm0, %xmm0 - pshufhw $0xb1, %xmm2, %xmm2 - movdqa %xmm0, %xmm1 - movdqa %xmm2, %xmm3 - psrlw $8, %xmm1 - psrlw $8, %xmm3 - psllw $8, %xmm0 - psllw $8, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm3, %xmm2 - movdqa %xmm0, 2*16(%rsp) - movdqa %xmm2, 3*16(%rsp) + movl %r11d, %eax + movdqa %xmm7, %xmm6 + movl %r15d, %ebx + rorl $(25-11), %eax + xorl %r11d, %eax + movdqa %xmm7, %xmm8 + movl %r12d, %ecx + rorl $(22-13), %ebx + xorl %r15d, %ebx + pslld $(32-18), %xmm7 + rorl $(11-6), %eax + xorl %r13d, %ecx + rorl $(13-2), %ebx + psrld $18, %xmm6 + xorl %r11d, %eax + andl %r11d, %ecx + rorl $6, %eax + pxor %xmm7, %xmm5 + xorl %r15d, %ebx + xorl %r13d, %ecx + psrld $3, %xmm8 + addl %eax, %ecx + addl 1*4(%rsp), %ecx + rorl $2, %ebx + pxor %xmm6, %xmm5 + movl %r15d, %eax + addl %ecx, %r14d + movl %r15d, %ecx + pxor %xmm8, %xmm5 + orl %r9d, %eax + addl %r14d, %r10d + andl %r9d, %ecx + pshufd $0xfa, %xmm1, %xmm6 + andl %r8d, %eax + addl %ebx, %r14d + paddd %xmm5, %xmm4 + orl %ecx, %eax + addl %eax, %r14d - jmp sha256_transform_phe_core + movl %r10d, %eax + movdqa %xmm6, %xmm7 + movl %r14d, 
%ebx + rorl $(25-11), %eax + xorl %r10d, %eax + movdqa %xmm6, %xmm8 + rorl $(22-13), %ebx + movl %r11d, %ecx + xorl %r14d, %ebx + psrlq $17, %xmm6 + psrlq $19, %xmm7 + rorl $(11-6), %eax + xorl %r12d, %ecx + xorl %r10d, %eax + psrld $10, %xmm8 + pxor %xmm7, %xmm6 + andl %r10d, %ecx + rorl $(13-2), %ebx + xorl %r14d, %ebx + pxor %xmm6, %xmm8 + xorl %r12d, %ecx + rorl $6, %eax + addl %eax, %ecx + pshufd $0x8f, %xmm8, %xmm8 + rorl $2, %ebx + addl 2*4(%rsp), %ecx + movl %r14d, %eax + psrldq $8, %xmm8 + addl %ecx, %r13d + movl %r14d, %ecx + orl %r8d, %eax + paddd %xmm8, %xmm4 + addl %r13d, %r9d + andl %r8d, %ecx + andl %r15d, %eax + pshufd $0x50, %xmm4, %xmm6 + addl %ebx, %r13d + orl %ecx, %eax + addl %eax, %r13d -sha256_transform_phe_noswap: - movdqu 0*16(%rsi), %xmm0 - movdqu 1*16(%rsi), %xmm1 - movdqu 2*16(%rsi), %xmm2 - movdqu 3*16(%rsi), %xmm3 - movdqa %xmm0, 0*16(%rsp) - movdqa %xmm1, 1*16(%rsp) - movdqa %xmm2, 2*16(%rsp) - movdqa %xmm3, 3*16(%rsp) + movdqa %xmm6, %xmm7 + movl %r9d, %eax + rorl $(25-11), %eax + movl %r13d, %ebx + movdqa %xmm6, %xmm2 + rorl $(22-13), %ebx + xorl %r9d, %eax + movl %r10d, %ecx + psrlq $17, %xmm6 + rorl $(11-6), %eax + xorl %r13d, %ebx + xorl %r11d, %ecx + psrlq $19, %xmm7 + xorl %r9d, %eax + andl %r9d, %ecx + rorl $(13-2), %ebx + psrld $10, %xmm2 + xorl %r13d, %ebx + rorl $6, %eax + pxor %xmm7, %xmm6 + xorl %r11d, %ecx + rorl $2, %ebx + addl %eax, %ecx + pxor %xmm6, %xmm2 + addl 3*4(%rsp), %ecx + movl %r13d, %eax + addl %ecx, %r12d + pshufd $0xf8, %xmm2, %xmm2 + movl %r13d, %ecx + orl %r15d, %eax + addl %r12d, %r8d + pslldq $8, %xmm2 + andl %r15d, %ecx + andl %r14d, %eax + paddd %xmm4, %xmm2 + addl %ebx, %r12d + orl %ecx, %eax + addl %eax, %r12d + movdqa 3*16(%rdx), %xmm9 + paddd %xmm3, %xmm9 + movdqa %xmm9, (%rsp) + addq $4*16, %rdx + movdqa %xmm2, %xmm4 + movl %r8d, %eax + movdqa %xmm1, %xmm6 + rorl $(25-11), %eax + movl %r12d, %ebx + pslldq $12, %xmm4 + rorl $(22-13), %ebx + psrldq $4, %xmm6 + xorl %r8d, %eax + movl %r9d, %ecx + rorl $(11-6), %eax + pxor %xmm6, %xmm4 + movdqa %xmm0, %xmm5 + xorl %r12d, %ebx + xorl %r10d, %ecx + xorl %r8d, %eax + paddd %xmm3, %xmm4 + movdqa %xmm3, %xmm7 + andl %r8d, %ecx + rorl $(13-2), %ebx + xorl %r12d, %ebx + pslldq $12, %xmm5 + psrldq $4, %xmm7 + rorl $6, %eax + xorl %r10d, %ecx + pxor %xmm7, %xmm5 + rorl $2, %ebx + addl %eax, %ecx + addl (%rsp) , %ecx + movdqa %xmm5, %xmm6 + movl %r12d, %eax + addl %ecx, %r11d + movl %r12d, %ecx + movdqa %xmm5, %xmm7 + orl %r14d, %eax + addl %r11d, %r15d + andl %r14d, %ecx + pslld $(32-7), %xmm5 + psrld $7, %xmm6 + andl %r13d, %eax + addl %ebx, %r11d + orl %ecx, %eax + por %xmm6, %xmm5 + addl %eax, %r11d -sha256_transform_phe_core: - movq %rsp, %rsi - movq $-1, %rax - movq $1, %rcx - /* rep xsha256 */ - .byte 0xf3, 0x0f, 0xa6, 0xd0 + movl %r15d, %eax + movdqa %xmm7, %xmm6 + movl %r11d, %ebx + rorl $(25-11), %eax + xorl %r15d, %eax + movdqa %xmm7, %xmm8 + movl %r8d, %ecx + rorl $(22-13), %ebx + xorl %r11d, %ebx + pslld $(32-18), %xmm7 + rorl $(11-6), %eax + xorl %r9d, %ecx + rorl $(13-2), %ebx + psrld $18, %xmm6 + xorl %r15d, %eax + andl %r15d, %ecx + rorl $6, %eax + pxor %xmm7, %xmm5 + xorl %r11d, %ebx + xorl %r9d, %ecx + psrld $3, %xmm8 + addl %eax, %ecx + addl 1*4(%rsp), %ecx + rorl $2, %ebx + pxor %xmm6, %xmm5 + movl %r11d, %eax + addl %ecx, %r10d + movl %r11d, %ecx + pxor %xmm8, %xmm5 + orl %r13d, %eax + addl %r10d, %r14d + andl %r13d, %ecx + pshufd $0xfa, %xmm2, %xmm6 + andl %r12d, %eax + addl %ebx, %r10d + paddd %xmm5, %xmm4 + orl %ecx, %eax + addl %eax, %r10d - movq %r8, %rsp 
-#if defined(_WIN64) || defined(__CYGWIN__) - popq %rsi - popq %rdi -#endif - ret + movl %r14d, %eax + movdqa %xmm6, %xmm7 + movl %r10d, %ebx + rorl $(25-11), %eax + xorl %r14d, %eax + movdqa %xmm6, %xmm8 + rorl $(22-13), %ebx + movl %r15d, %ecx + xorl %r10d, %ebx + psrlq $17, %xmm6 + psrlq $19, %xmm7 + rorl $(11-6), %eax + xorl %r8d, %ecx + xorl %r14d, %eax + psrld $10, %xmm8 + pxor %xmm7, %xmm6 + andl %r14d, %ecx + rorl $(13-2), %ebx + xorl %r10d, %ebx + pxor %xmm6, %xmm8 + xorl %r8d, %ecx + rorl $6, %eax + addl %eax, %ecx + pshufd $0x8f, %xmm8, %xmm8 + rorl $2, %ebx + addl 2*4(%rsp), %ecx + movl %r10d, %eax + psrldq $8, %xmm8 + addl %ecx, %r9d + movl %r10d, %ecx + orl %r12d, %eax + paddd %xmm8, %xmm4 + addl %r9d, %r13d + andl %r12d, %ecx + andl %r11d, %eax + pshufd $0x50, %xmm4, %xmm6 + addl %ebx, %r9d + orl %ecx, %eax + addl %eax, %r9d + movdqa %xmm6, %xmm7 + movl %r13d, %eax + rorl $(25-11), %eax + movl %r9d, %ebx + movdqa %xmm6, %xmm3 + rorl $(22-13), %ebx + xorl %r13d, %eax + movl %r14d, %ecx + psrlq $17, %xmm6 + rorl $(11-6), %eax + xorl %r9d, %ebx + xorl %r15d, %ecx + psrlq $19, %xmm7 + xorl %r13d, %eax + andl %r13d, %ecx + rorl $(13-2), %ebx + psrld $10, %xmm3 + xorl %r9d, %ebx + rorl $6, %eax + pxor %xmm7, %xmm6 + xorl %r15d, %ecx + rorl $2, %ebx + addl %eax, %ecx + pxor %xmm6, %xmm3 + addl 3*4(%rsp), %ecx + movl %r9d, %eax + addl %ecx, %r8d + pshufd $0xf8, %xmm3, %xmm3 + movl %r9d, %ecx + orl %r11d, %eax + addl %r8d, %r12d + pslldq $8, %xmm3 + andl %r11d, %ecx + andl %r10d, %eax + paddd %xmm4, %xmm3 + addl %ebx, %r8d + orl %ecx, %eax + addl %eax, %r8d - .data - .p2align 3 -sha256_transform_addr: - .quad sha256_transform_sse2 + subq $16, %rsi + jne sha256_transform_sse2_loop - .text - .p2align 3 - .globl sha256_transform - .globl _sha256_transform -sha256_transform: -_sha256_transform: - jmp *sha256_transform_addr(%rip) + paddd 0*16(%rdx), %xmm0 + movdqa %xmm0, (%rsp) + movl %r12d, %eax + rorl $(25-11), %eax + movl %r8d, %ebx + xorl %r12d, %eax + rorl $(22-13), %ebx + movl %r13d, %ecx + xorl %r8d, %ebx + rorl $(11-6), %eax + xorl %r14d, %ecx + xorl %r12d, %eax + rorl $(13-2), %ebx + andl %r12d, %ecx + xorl %r8d, %ebx + rorl $6, %eax + xorl %r14d, %ecx + addl %eax, %ecx + rorl $2, %ebx + addl 0*4(%rsp), %ecx + movl %r8d, %eax + addl %ecx, %r15d + movl %r8d, %ecx + orl %r10d, %eax + addl %r15d, %r11d + andl %r10d, %ecx + andl %r9d, %eax + addl %ebx, %r15d + orl %ecx, %eax + addl %eax, %r15d + movl %r11d, %eax + rorl $(25-11), %eax + movl %r15d, %ebx + xorl %r11d, %eax + rorl $(22-13), %ebx + movl %r12d, %ecx + xorl %r15d, %ebx + rorl $(11-6), %eax + xorl %r13d, %ecx + xorl %r11d, %eax + rorl $(13-2), %ebx + andl %r11d, %ecx + xorl %r15d, %ebx + rorl $6, %eax + xorl %r13d, %ecx + addl %eax, %ecx + rorl $2, %ebx + addl 1*4(%rsp), %ecx + movl %r15d, %eax + addl %ecx, %r14d + movl %r15d, %ecx + orl %r9d, %eax + addl %r14d, %r10d + andl %r9d, %ecx + andl %r8d, %eax + addl %ebx, %r14d + orl %ecx, %eax + addl %eax, %r14d + movl %r10d, %eax + rorl $(25-11), %eax + movl %r14d, %ebx + xorl %r10d, %eax + rorl $(22-13), %ebx + movl %r11d, %ecx + xorl %r14d, %ebx + rorl $(11-6), %eax + xorl %r12d, %ecx + xorl %r10d, %eax + rorl $(13-2), %ebx + andl %r10d, %ecx + xorl %r14d, %ebx + rorl $6, %eax + xorl %r12d, %ecx + addl %eax, %ecx + rorl $2, %ebx + addl 2*4(%rsp), %ecx + movl %r14d, %eax + addl %ecx, %r13d + movl %r14d, %ecx + orl %r8d, %eax + addl %r13d, %r9d + andl %r8d, %ecx + andl %r15d, %eax + addl %ebx, %r13d + orl %ecx, %eax + addl %eax, %r13d + movl %r9d, %eax + rorl $(25-11), %eax + 
movl %r13d, %ebx + xorl %r9d, %eax + rorl $(22-13), %ebx + movl %r10d, %ecx + xorl %r13d, %ebx + rorl $(11-6), %eax + xorl %r11d, %ecx + xorl %r9d, %eax + rorl $(13-2), %ebx + andl %r9d, %ecx + xorl %r13d, %ebx + rorl $6, %eax + xorl %r11d, %ecx + addl %eax, %ecx + rorl $2, %ebx + addl 3*4(%rsp), %ecx + movl %r13d, %eax + addl %ecx, %r12d + movl %r13d, %ecx + orl %r15d, %eax + addl %r12d, %r8d + andl %r15d, %ecx + andl %r14d, %eax + addl %ebx, %r12d + orl %ecx, %eax + addl %eax, %r12d + paddd 1*16(%rdx), %xmm1 + movdqa %xmm1, (%rsp) + movl %r8d, %eax + rorl $(25-11), %eax + movl %r12d, %ebx + xorl %r8d, %eax + rorl $(22-13), %ebx + movl %r9d, %ecx + xorl %r12d, %ebx + rorl $(11-6), %eax + xorl %r10d, %ecx + xorl %r8d, %eax + rorl $(13-2), %ebx + andl %r8d, %ecx + xorl %r12d, %ebx + rorl $6, %eax + xorl %r10d, %ecx + addl %eax, %ecx + rorl $2, %ebx + addl 0*4(%rsp), %ecx + movl %r12d, %eax + addl %ecx, %r11d + movl %r12d, %ecx + orl %r14d, %eax + addl %r11d, %r15d + andl %r14d, %ecx + andl %r13d, %eax + addl %ebx, %r11d + orl %ecx, %eax + addl %eax, %r11d + movl %r15d, %eax + rorl $(25-11), %eax + movl %r11d, %ebx + xorl %r15d, %eax + rorl $(22-13), %ebx + movl %r8d, %ecx + xorl %r11d, %ebx + rorl $(11-6), %eax + xorl %r9d, %ecx + xorl %r15d, %eax + rorl $(13-2), %ebx + andl %r15d, %ecx + xorl %r11d, %ebx + rorl $6, %eax + xorl %r9d, %ecx + addl %eax, %ecx + rorl $2, %ebx + addl 1*4(%rsp), %ecx + movl %r11d, %eax + addl %ecx, %r10d + movl %r11d, %ecx + orl %r13d, %eax + addl %r10d, %r14d + andl %r13d, %ecx + andl %r12d, %eax + addl %ebx, %r10d + orl %ecx, %eax + addl %eax, %r10d + movl %r14d, %eax + rorl $(25-11), %eax + movl %r10d, %ebx + xorl %r14d, %eax + rorl $(22-13), %ebx + movl %r15d, %ecx + xorl %r10d, %ebx + rorl $(11-6), %eax + xorl %r8d, %ecx + xorl %r14d, %eax + rorl $(13-2), %ebx + andl %r14d, %ecx + xorl %r10d, %ebx + rorl $6, %eax + xorl %r8d, %ecx + addl %eax, %ecx + rorl $2, %ebx + addl 2*4(%rsp), %ecx + movl %r10d, %eax + addl %ecx, %r9d + movl %r10d, %ecx + orl %r12d, %eax + addl %r9d, %r13d + andl %r12d, %ecx + andl %r11d, %eax + addl %ebx, %r9d + orl %ecx, %eax + addl %eax, %r9d + movl %r13d, %eax + rorl $(25-11), %eax + movl %r9d, %ebx + xorl %r13d, %eax + rorl $(22-13), %ebx + movl %r14d, %ecx + xorl %r9d, %ebx + rorl $(11-6), %eax + xorl %r15d, %ecx + xorl %r13d, %eax + rorl $(13-2), %ebx + andl %r13d, %ecx + xorl %r9d, %ebx + rorl $6, %eax + xorl %r15d, %ecx + addl %eax, %ecx + rorl $2, %ebx + addl 3*4(%rsp), %ecx + movl %r9d, %eax + addl %ecx, %r8d + movl %r9d, %ecx + orl %r11d, %eax + addl %r8d, %r12d + andl %r11d, %ecx + andl %r10d, %eax + addl %ebx, %r8d + orl %ecx, %eax + addl %eax, %r8d + paddd 2*16(%rdx), %xmm2 + movdqa %xmm2, (%rsp) + movl %r12d, %eax + rorl $(25-11), %eax + movl %r8d, %ebx + xorl %r12d, %eax + rorl $(22-13), %ebx + movl %r13d, %ecx + xorl %r8d, %ebx + rorl $(11-6), %eax + xorl %r14d, %ecx + xorl %r12d, %eax + rorl $(13-2), %ebx + andl %r12d, %ecx + xorl %r8d, %ebx + rorl $6, %eax + xorl %r14d, %ecx + addl %eax, %ecx + rorl $2, %ebx + addl 0*4(%rsp), %ecx + movl %r8d, %eax + addl %ecx, %r15d + movl %r8d, %ecx + orl %r10d, %eax + addl %r15d, %r11d + andl %r10d, %ecx + andl %r9d, %eax + addl %ebx, %r15d + orl %ecx, %eax + addl %eax, %r15d + movl %r11d, %eax + rorl $(25-11), %eax + movl %r15d, %ebx + xorl %r11d, %eax + rorl $(22-13), %ebx + movl %r12d, %ecx + xorl %r15d, %ebx + rorl $(11-6), %eax + xorl %r13d, %ecx + xorl %r11d, %eax + rorl $(13-2), %ebx + andl %r11d, %ecx + xorl %r15d, %ebx + rorl $6, %eax + xorl %r13d, %ecx + addl %eax, %ecx 
+ rorl $2, %ebx + addl 1*4(%rsp), %ecx + movl %r15d, %eax + addl %ecx, %r14d + movl %r15d, %ecx + orl %r9d, %eax + addl %r14d, %r10d + andl %r9d, %ecx + andl %r8d, %eax + addl %ebx, %r14d + orl %ecx, %eax + addl %eax, %r14d + movl %r10d, %eax + rorl $(25-11), %eax + movl %r14d, %ebx + xorl %r10d, %eax + rorl $(22-13), %ebx + movl %r11d, %ecx + xorl %r14d, %ebx + rorl $(11-6), %eax + xorl %r12d, %ecx + xorl %r10d, %eax + rorl $(13-2), %ebx + andl %r10d, %ecx + xorl %r14d, %ebx + rorl $6, %eax + xorl %r12d, %ecx + addl %eax, %ecx + rorl $2, %ebx + addl 2*4(%rsp), %ecx + movl %r14d, %eax + addl %ecx, %r13d + movl %r14d, %ecx + orl %r8d, %eax + addl %r13d, %r9d + andl %r8d, %ecx + andl %r15d, %eax + addl %ebx, %r13d + orl %ecx, %eax + addl %eax, %r13d + movl %r9d, %eax + rorl $(25-11), %eax + movl %r13d, %ebx + xorl %r9d, %eax + rorl $(22-13), %ebx + movl %r10d, %ecx + xorl %r13d, %ebx + rorl $(11-6), %eax + xorl %r11d, %ecx + xorl %r9d, %eax + rorl $(13-2), %ebx + andl %r9d, %ecx + xorl %r13d, %ebx + rorl $6, %eax + xorl %r11d, %ecx + addl %eax, %ecx + rorl $2, %ebx + addl 3*4(%rsp), %ecx + movl %r13d, %eax + addl %ecx, %r12d + movl %r13d, %ecx + orl %r15d, %eax + addl %r12d, %r8d + andl %r15d, %ecx + andl %r14d, %eax + addl %ebx, %r12d + orl %ecx, %eax + addl %eax, %r12d + paddd 3*16(%rdx), %xmm3 + movdqa %xmm3, (%rsp) + movl %r8d, %eax + rorl $(25-11), %eax + movl %r12d, %ebx + xorl %r8d, %eax + rorl $(22-13), %ebx + movl %r9d, %ecx + xorl %r12d, %ebx + rorl $(11-6), %eax + xorl %r10d, %ecx + xorl %r8d, %eax + rorl $(13-2), %ebx + andl %r8d, %ecx + xorl %r12d, %ebx + rorl $6, %eax + xorl %r10d, %ecx + addl %eax, %ecx + rorl $2, %ebx + addl 0*4(%rsp), %ecx + movl %r12d, %eax + addl %ecx, %r11d + movl %r12d, %ecx + orl %r14d, %eax + addl %r11d, %r15d + andl %r14d, %ecx + andl %r13d, %eax + addl %ebx, %r11d + orl %ecx, %eax + addl %eax, %r11d + movl %r15d, %eax + rorl $(25-11), %eax + movl %r11d, %ebx + xorl %r15d, %eax + rorl $(22-13), %ebx + movl %r8d, %ecx + xorl %r11d, %ebx + rorl $(11-6), %eax + xorl %r9d, %ecx + xorl %r15d, %eax + rorl $(13-2), %ebx + andl %r15d, %ecx + xorl %r11d, %ebx + rorl $6, %eax + xorl %r9d, %ecx + addl %eax, %ecx + rorl $2, %ebx + addl 1*4(%rsp), %ecx + movl %r11d, %eax + addl %ecx, %r10d + movl %r11d, %ecx + orl %r13d, %eax + addl %r10d, %r14d + andl %r13d, %ecx + andl %r12d, %eax + addl %ebx, %r10d + orl %ecx, %eax + addl %eax, %r10d + movl %r14d, %eax + rorl $(25-11), %eax + movl %r10d, %ebx + xorl %r14d, %eax + rorl $(22-13), %ebx + movl %r15d, %ecx + xorl %r10d, %ebx + rorl $(11-6), %eax + xorl %r8d, %ecx + xorl %r14d, %eax + rorl $(13-2), %ebx + andl %r14d, %ecx + xorl %r10d, %ebx + rorl $6, %eax + xorl %r8d, %ecx + addl %eax, %ecx + rorl $2, %ebx + addl 2*4(%rsp), %ecx + movl %r10d, %eax + addl %ecx, %r9d + movl %r10d, %ecx + orl %r12d, %eax + addl %r9d, %r13d + andl %r12d, %ecx + andl %r11d, %eax + addl %ebx, %r9d + orl %ecx, %eax + addl %eax, %r9d + movl %r13d, %eax + rorl $(25-11), %eax + movl %r9d, %ebx + xorl %r13d, %eax + rorl $(22-13), %ebx + movl %r14d, %ecx + xorl %r9d, %ebx + rorl $(11-6), %eax + xorl %r15d, %ecx + xorl %r13d, %eax + rorl $(13-2), %ebx + andl %r13d, %ecx + xorl %r9d, %ebx + rorl $6, %eax + xorl %r15d, %ecx + addl %eax, %ecx + rorl $2, %ebx + addl 3*4(%rsp), %ecx + movl %r9d, %eax + addl %ecx, %r8d + movl %r9d, %ecx + orl %r11d, %eax + addl %r8d, %r12d + andl %r11d, %ecx + andl %r10d, %eax + addl %ebx, %r8d + orl %ecx, %eax + addl %eax, %r8d + + addl %r8d, 0*4(%rdi) + addl %r9d, 1*4(%rdi) + addl %r10d, 2*4(%rdi) + addl %r11d, 
3*4(%rdi) + addl %r12d, 4*4(%rdi) + addl %r13d, 5*4(%rdi) + addl %r14d, 6*4(%rdi) + addl %r15d, 7*4(%rdi) + +#if defined(_WIN64) || defined(__CYGWIN__) + movdqa 1*16(%rsp), %xmm6 + movdqa 2*16(%rsp), %xmm7 + movdqa 3*16(%rsp), %xmm8 + movdqa 4*16(%rsp), %xmm9 + addq $5*16, %rsp + popq %rsi + popq %rdi +#else + addq $16, %rsp +#endif + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + ret .text .p2align 6 - .globl sha256d_ms - .globl _sha256d_ms -sha256d_ms: -_sha256d_ms: +sha256_transform_phe: #if defined(_WIN64) || defined(__CYGWIN__) pushq %rdi pushq %rsi @@ -506,13 +1285,11 @@ _sha256d_ms: movq %r8, %rdx #endif movq %rsp, %r8 - subq $32, %rsp - andq $-32, %rsp + subq $64, %rsp + andq $-64, %rsp - movdqa 0*16(%rdx), %xmm0 - movdqa 1*16(%rdx), %xmm1 - movdqa %xmm0, 0*16(%rdi) - movdqa %xmm1, 1*16(%rdi) + testq %rdx, %rdx + jnz sha256_transform_phe_noswap movl 0*4(%rsi), %eax movl 1*4(%rsi), %ecx @@ -526,29 +1303,134 @@ _sha256d_ms: movl %ecx, 1*4(%rsp) movl %edx, 2*4(%rsp) movl %r9d, 3*4(%rsp) + movl 4*4(%rsi), %eax + movl 5*4(%rsi), %ecx + movl 6*4(%rsi), %edx + movl 7*4(%rsi), %r9d + bswapl %eax + bswapl %ecx + bswapl %edx + bswapl %r9d + movl %eax, 4*4(%rsp) + movl %ecx, 5*4(%rsp) + movl %edx, 6*4(%rsp) + movl %r9d, 7*4(%rsp) - movq %rsp, %rsi - movl $64, %eax - movl $80, %ecx - /* rep xsha256 */ - .byte 0xf3, 0x0f, 0xa6, 0xd0 + movdqu 2*16(%rsi), %xmm0 + movdqu 3*16(%rsi), %xmm2 + pshuflw $0xb1, %xmm0, %xmm0 + pshuflw $0xb1, %xmm2, %xmm2 + pshufhw $0xb1, %xmm0, %xmm0 + pshufhw $0xb1, %xmm2, %xmm2 + movdqa %xmm0, %xmm1 + movdqa %xmm2, %xmm3 + psrlw $8, %xmm1 + psrlw $8, %xmm3 + psllw $8, %xmm0 + psllw $8, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm3, %xmm2 + movdqa %xmm0, 2*16(%rsp) + movdqa %xmm2, 3*16(%rsp) - movdqa bswap_xmm_mask(%rip), %xmm1 - movdqa 0*16(%rdi), %xmm0 - movdqa 1*16(%rdi), %xmm2 - pshufb %xmm1, %xmm0 - pshufb %xmm1, %xmm2 - movdqa %xmm0, 0*16(%rsp) - movdqa %xmm2, 1*16(%rsp) + jmp sha256_transform_phe_core - movdqa sha256_h+0*16(%rip), %xmm0 - movdqa sha256_h+1*16(%rip), %xmm1 - movdqa %xmm0, 0*16(%rdi) - movdqa %xmm1, 1*16(%rdi) +sha256_transform_phe_noswap: + movdqu 0*16(%rsi), %xmm0 + movdqu 1*16(%rsi), %xmm1 + movdqu 2*16(%rsi), %xmm2 + movdqu 3*16(%rsi), %xmm3 + movdqa %xmm0, 0*16(%rsp) + movdqa %xmm1, 1*16(%rsp) + movdqa %xmm2, 2*16(%rsp) + movdqa %xmm3, 3*16(%rsp) +sha256_transform_phe_core: movq %rsp, %rsi - xorq %rax, %rax - movl $32, %ecx + movq $-1, %rax + movq $1, %rcx + /* rep xsha256 */ + .byte 0xf3, 0x0f, 0xa6, 0xd0 + + movq %r8, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + popq %rdi +#endif + ret + + + .data + .p2align 3 +sha256_transform_addr: + .quad sha256_transform_sse2 + + .text + .p2align 3 + .globl sha256_transform + .globl _sha256_transform +sha256_transform: +_sha256_transform: + jmp *sha256_transform_addr(%rip) + + + .text + .p2align 6 + .globl sha256d_ms + .globl _sha256d_ms +sha256d_ms: +_sha256d_ms: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + movq %rsp, %r8 + subq $32, %rsp + andq $-32, %rsp + + movdqa 0*16(%rdx), %xmm0 + movdqa 1*16(%rdx), %xmm1 + movdqa %xmm0, 0*16(%rdi) + movdqa %xmm1, 1*16(%rdi) + + movl 0*4(%rsi), %eax + movl 1*4(%rsi), %ecx + movl 2*4(%rsi), %edx + movl 3*4(%rsi), %r9d + bswapl %eax + bswapl %ecx + bswapl %edx + bswapl %r9d + movl %eax, 0*4(%rsp) + movl %ecx, 1*4(%rsp) + movl %edx, 2*4(%rsp) + movl %r9d, 3*4(%rsp) + + movq %rsp, %rsi + movl $64, %eax + movl $80, %ecx + /* rep xsha256 */ + .byte 0xf3, 0x0f, 
0xa6, 0xd0 + + movdqa bswap_xmm_mask(%rip), %xmm1 + movdqa 0*16(%rdi), %xmm0 + movdqa 1*16(%rdi), %xmm2 + pshufb %xmm1, %xmm0 + pshufb %xmm1, %xmm2 + movdqa %xmm0, 0*16(%rsp) + movdqa %xmm2, 1*16(%rsp) + + movdqa sha256_h+0*16(%rip), %xmm0 + movdqa sha256_h+1*16(%rip), %xmm1 + movdqa %xmm0, 0*16(%rdi) + movdqa %xmm1, 1*16(%rdi) + + movq %rsp, %rsi + xorq %rax, %rax + movl $32, %ecx /* rep xsha256 */ .byte 0xf3, 0x0f, 0xa6, 0xd0 @@ -814,40 +1696,48 @@ _sha256_init_8way: #endif /* USE_AVX2 */ -.macro sha256_sse2_extend_round i - movdqa (\i-15)*16(%rax), %xmm0 - movdqa %xmm0, %xmm2 - psrld $3, %xmm0 - movdqa %xmm0, %xmm1 - pslld $14, %xmm2 - psrld $4, %xmm1 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - psrld $11, %xmm1 - pslld $11, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - paddd (\i-16)*16(%rax), %xmm0 - paddd (\i-7)*16(%rax), %xmm0 - movdqa %xmm3, %xmm2 - psrld $10, %xmm3 - pslld $13, %xmm2 - movdqa %xmm3, %xmm1 - psrld $7, %xmm1 - pxor %xmm1, %xmm3 - pxor %xmm2, %xmm3 - psrld $2, %xmm1 - pslld $2, %xmm2 - pxor %xmm1, %xmm3 - pxor %xmm2, %xmm3 - paddd %xmm0, %xmm3 - movdqa %xmm3, \i*16(%rax) -.endm -.macro sha256_sse2_extend_doubleround i - movdqa (\i-15)*16(%rax), %xmm0 - movdqa (\i-14)*16(%rax), %xmm4 + + + +#if defined(USE_AVX) + + + + + +#endif /* USE_AVX */ + + +#if defined(USE_AVX2) + + + + + +#endif /* USE_AVX2 */ + + +#if defined(USE_XOP) + + + + + +#endif /* USE_XOP */ + + + .text + .p2align 6 +sha256_transform_4way_core_sse2: + leaq 256(%rsp), %rcx + leaq 48*16(%rcx), %rax + movdqa -2*16(%rcx), %xmm3 + movdqa -1*16(%rcx), %xmm7 +sha256_transform_4way_sse2_extend_loop: + movdqa -15*16(%rcx), %xmm0 + movdqa -14*16(%rcx), %xmm4 movdqa %xmm0, %xmm2 movdqa %xmm4, %xmm6 psrld $3, %xmm0 @@ -871,8 +1761,8 @@ _sha256_init_8way: pxor %xmm2, %xmm0 pxor %xmm6, %xmm4 - paddd (\i-16)*16(%rax), %xmm0 - paddd (\i-15)*16(%rax), %xmm4 + paddd -16*16(%rcx), %xmm0 + paddd -15*16(%rcx), %xmm4 movdqa %xmm3, %xmm2 movdqa %xmm7, %xmm6 @@ -885,8 +1775,8 @@ _sha256_init_8way: psrld $7, %xmm1 psrld $7, %xmm5 - paddd (\i-7)*16(%rax), %xmm0 - paddd (\i-6)*16(%rax), %xmm4 + paddd -7*16(%rcx), %xmm0 + paddd -6*16(%rcx), %xmm4 pxor %xmm1, %xmm3 pxor %xmm5, %xmm7 @@ -903,31 +1793,44 @@ _sha256_init_8way: paddd %xmm0, %xmm3 paddd %xmm4, %xmm7 - movdqa %xmm3, \i*16(%rax) - movdqa %xmm7, (\i+1)*16(%rax) -.endm - -.macro sha256_sse2_main_round i - movdqa 16*(\i)(%rax), %xmm6 + movdqa %xmm3, (%rcx) + movdqa %xmm7, 16(%rcx) + addq $2*16, %rcx + cmpq %rcx, %rax + jne sha256_transform_4way_sse2_extend_loop + + movdqu 0(%rdi), %xmm7 + movdqu 16(%rdi), %xmm5 + movdqu 32(%rdi), %xmm4 + movdqu 48(%rdi), %xmm3 + movdqu 64(%rdi), %xmm0 + movdqu 80(%rdi), %xmm8 + movdqu 96(%rdi), %xmm9 + movdqu 112(%rdi), %xmm10 + + leaq sha256_4k(%rip), %rcx + xorq %rax, %rax +sha256_transform_4way_sse2_main_loop: + movdqa (%rsp, %rax), %xmm6 + paddd (%rcx, %rax), %xmm6 + paddd %xmm10, %xmm6 movdqa %xmm0, %xmm1 - movdqa 16(%rsp), %xmm2 + movdqa %xmm9, %xmm2 pandn %xmm2, %xmm1 - paddd 32(%rsp), %xmm6 - movdqa %xmm2, 32(%rsp) - movdqa 0(%rsp), %xmm2 - movdqa %xmm2, 16(%rsp) + movdqa %xmm2, %xmm10 + movdqa %xmm8, %xmm2 + movdqa %xmm2, %xmm9 pand %xmm0, %xmm2 pxor %xmm2, %xmm1 - movdqa %xmm0, 0(%rsp) + movdqa %xmm0, %xmm8 paddd %xmm1, %xmm6 movdqa %xmm0, %xmm1 psrld $6, %xmm0 - paddd 16*(\i)(%rcx), %xmm6 movdqa %xmm0, %xmm2 pslld $7, %xmm1 psrld $5, %xmm2 @@ -936,19 +1839,20 @@ _sha256_init_8way: pslld $14, %xmm1 psrld $14, %xmm2 pxor %xmm1, %xmm0 - pslld $5, %xmm1 pxor %xmm2, %xmm0 + pslld $5, %xmm1 pxor %xmm1, %xmm0 - movdqa %xmm5, %xmm1 paddd 
%xmm0, %xmm6 movdqa %xmm3, %xmm0 + paddd %xmm6, %xmm0 + + movdqa %xmm5, %xmm1 movdqa %xmm4, %xmm3 movdqa %xmm4, %xmm2 - paddd %xmm6, %xmm0 pand %xmm5, %xmm2 - pand %xmm7, %xmm1 pand %xmm7, %xmm4 + pand %xmm7, %xmm1 pxor %xmm4, %xmm1 movdqa %xmm5, %xmm4 movdqa %xmm7, %xmm5 @@ -961,56 +1865,81 @@ _sha256_init_8way: pslld $10, %xmm2 psrld $11, %xmm1 pxor %xmm2, %xmm7 - pslld $9, %xmm2 pxor %xmm1, %xmm7 + pslld $9, %xmm2 psrld $9, %xmm1 pxor %xmm2, %xmm7 - pslld $11, %xmm2 pxor %xmm1, %xmm7 + pslld $11, %xmm2 pxor %xmm2, %xmm7 paddd %xmm6, %xmm7 -.endm - -.macro sha256_sse2_main_quadround i - sha256_sse2_main_round \i+0 - sha256_sse2_main_round \i+1 - sha256_sse2_main_round \i+2 - sha256_sse2_main_round \i+3 -.endm + + addq $16, %rax + cmpq $16*64, %rax + jne sha256_transform_4way_sse2_main_loop + jmp sha256_transform_4way_finish #if defined(USE_AVX) - -.macro sha256_avx_extend_round i - vmovdqa (\i-15)*16(%rax), %xmm0 + .text + .p2align 6 +sha256_transform_4way_core_avx: + leaq 256(%rsp), %rax + movdqa -2*16(%rax), %xmm3 + movdqa -1*16(%rax), %xmm7 + vmovdqa (0-15)*16(%rax), %xmm0 + vmovdqa (0-14)*16(%rax), %xmm4 vpslld $14, %xmm0, %xmm2 - vpsrld $3, %xmm0, %xmm0 - vpsrld $4, %xmm0, %xmm1 - vpxor %xmm1, %xmm0, %xmm0 - vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 vpslld $11, %xmm2, %xmm2 - vpxor %xmm1, %xmm0, %xmm0 - vpxor %xmm2, %xmm0, %xmm0 - vpaddd (\i-16)*16(%rax), %xmm0, %xmm0 - vpaddd (\i-7)*16(%rax), %xmm0, %xmm0 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (0-16)*16(%rax), %xmm8, %xmm0 vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (0-7)*16(%rax), %xmm0, %xmm0 + vpaddd (0-6)*16(%rax), %xmm4, %xmm4 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 vpxor %xmm2, %xmm3, %xmm3 - vpaddd %xmm0, %xmm3, %xmm3 - vmovdqa %xmm3, \i*16(%rax) -.endm + vpxor %xmm6, %xmm7, %xmm7 -.macro sha256_avx_extend_doubleround i - vmovdqa (\i-15)*16(%rax), %xmm0 - vmovdqa (\i-14)*16(%rax), %xmm4 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 0*16(%rax) + vmovdqa %xmm7, (0+1)*16(%rax) + vmovdqa (2-15)*16(%rax), %xmm0 + vmovdqa (2-14)*16(%rax), %xmm4 vpslld $14, %xmm0, %xmm2 vpslld $14, %xmm4, %xmm6 vpsrld $3, %xmm0, %xmm8 @@ -1031,15 +1960,15 @@ _sha256_init_8way: vpxor %xmm6, %xmm4, %xmm4 vpaddd %xmm0, %xmm4, %xmm4 - vpaddd (\i-16)*16(%rax), %xmm8, %xmm0 + vpaddd (2-16)*16(%rax), %xmm8, %xmm0 vpslld $13, %xmm3, %xmm2 vpslld $13, %xmm7, %xmm6 vpsrld $10, %xmm3, %xmm3 vpsrld $10, %xmm7, %xmm7 - vpaddd (\i-7)*16(%rax), %xmm0, %xmm0 - vpaddd (\i-6)*16(%rax), %xmm4, %xmm4 + vpaddd (2-7)*16(%rax), %xmm0, %xmm0 + vpaddd (2-6)*16(%rax), %xmm4, %xmm4 vpsrld $7, %xmm3, %xmm1 vpsrld $7, %xmm7, %xmm5 @@ -1058,1007 +1987,16151 @@ _sha256_init_8way: vpaddd %xmm0, %xmm3, %xmm3 vpaddd %xmm4, %xmm7, %xmm7 - vmovdqa %xmm3, \i*16(%rax) - vmovdqa 
%xmm7, (\i+1)*16(%rax) -.endm - -.macro sha256_avx_main_round i, r0, r1, r2, r3, r4, r5, r6, r7 - vpaddd 16*(\i)(%rax), \r0, %xmm6 - vpaddd 16*(\i)(%rcx), %xmm6, %xmm6 - - vpandn \r1, \r3, %xmm1 - vpand \r3, \r2, %xmm2 - vpxor %xmm2, %xmm1, %xmm1 - vpaddd %xmm1, %xmm6, %xmm6 - - vpslld $7, \r3, %xmm1 - vpsrld $6, \r3, \r0 - vpsrld $5, \r0, %xmm2 - vpxor %xmm1, \r0, \r0 - vpxor %xmm2, \r0, \r0 - vpslld $14, %xmm1, %xmm1 - vpsrld $14, %xmm2, %xmm2 - vpxor %xmm1, \r0, \r0 - vpxor %xmm2, \r0, \r0 - vpslld $5, %xmm1, %xmm1 - vpxor %xmm1, \r0, \r0 - vpaddd \r0, %xmm6, %xmm6 - vpaddd %xmm6, \r4, \r0 - - vpand \r6, \r5, %xmm2 - vpand \r7, \r5, \r4 - vpand \r7, \r6, %xmm1 - vpxor \r4, %xmm1, %xmm1 - vpxor %xmm2, %xmm1, %xmm1 - vpaddd %xmm1, %xmm6, %xmm6 - - vpslld $10, \r7, %xmm2 - vpsrld $2, \r7, \r4 - vpsrld $11, \r4, %xmm1 - vpxor %xmm2, \r4, \r4 - vpxor %xmm1, \r4, \r4 - vpslld $9, %xmm2, %xmm2 - vpsrld $9, %xmm1, %xmm1 - vpxor %xmm2, \r4, \r4 - vpxor %xmm1, \r4, \r4 + vmovdqa %xmm3, 2*16(%rax) + vmovdqa %xmm7, (2+1)*16(%rax) + vmovdqa (4-15)*16(%rax), %xmm0 + vmovdqa (4-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 vpslld $11, %xmm2, %xmm2 - vpxor %xmm2, \r4, \r4 - vpaddd %xmm6, \r4, \r4 -.endm - -.macro sha256_avx_main_quadround i - sha256_avx_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 - sha256_avx_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 - sha256_avx_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 - sha256_avx_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 -.endm - -#endif /* USE_AVX */ - - -#if defined(USE_AVX2) - -.macro sha256_avx2_extend_round i - vmovdqa (\i-15)*32(%rax), %ymm0 - vpslld $14, %ymm0, %ymm2 - vpsrld $3, %ymm0, %ymm0 - vpsrld $4, %ymm0, %ymm1 - vpxor %ymm1, %ymm0, %ymm0 - vpxor %ymm2, %ymm0, %ymm0 - vpsrld $11, %ymm1, %ymm1 - vpslld $11, %ymm2, %ymm2 - vpxor %ymm1, %ymm0, %ymm0 - vpxor %ymm2, %ymm0, %ymm0 - vpaddd (\i-16)*32(%rax), %ymm0, %ymm0 - vpaddd (\i-7)*32(%rax), %ymm0, %ymm0 - - vpslld $13, %ymm3, %ymm2 - vpsrld $10, %ymm3, %ymm3 - vpsrld $7, %ymm3, %ymm1 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm2, %ymm3, %ymm3 - vpsrld $2, %ymm1, %ymm1 - vpslld $2, %ymm2, %ymm2 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm2, %ymm3, %ymm3 - vpaddd %ymm0, %ymm3, %ymm3 - vmovdqa %ymm3, \i*32(%rax) -.endm - -.macro sha256_avx2_extend_doubleround i - vmovdqa (\i-15)*32(%rax), %ymm0 - vmovdqa (\i-14)*32(%rax), %ymm4 - vpslld $14, %ymm0, %ymm2 - vpslld $14, %ymm4, %ymm6 - vpsrld $3, %ymm0, %ymm8 - vpsrld $3, %ymm4, %ymm4 - vpsrld $7, %ymm0, %ymm1 - vpsrld $4, %ymm4, %ymm5 - vpxor %ymm1, %ymm8, %ymm8 - vpxor %ymm5, %ymm4, %ymm4 - vpsrld $11, %ymm1, %ymm1 - vpsrld $11, %ymm5, %ymm5 - vpxor %ymm2, %ymm8, %ymm8 - vpxor %ymm6, %ymm4, %ymm4 - vpslld $11, %ymm2, %ymm2 - vpslld $11, %ymm6, %ymm6 - vpxor %ymm1, %ymm8, %ymm8 - vpxor %ymm5, %ymm4, %ymm4 - vpxor %ymm2, %ymm8, %ymm8 - vpxor %ymm6, %ymm4, %ymm4 - - vpaddd %ymm0, %ymm4, %ymm4 - vpaddd (\i-16)*32(%rax), %ymm8, %ymm0 - - vpslld $13, %ymm3, %ymm2 - vpslld $13, %ymm7, %ymm6 - vpsrld $10, %ymm3, %ymm3 - vpsrld $10, %ymm7, %ymm7 - - vpaddd (\i-7)*32(%rax), %ymm0, %ymm0 - vpaddd (\i-6)*32(%rax), %ymm4, %ymm4 - - vpsrld $7, %ymm3, %ymm1 - vpsrld 
$7, %ymm7, %ymm5 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpsrld $2, %ymm1, %ymm1 - vpsrld $2, %ymm5, %ymm5 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpslld $2, %ymm2, %ymm2 - vpslld $2, %ymm6, %ymm6 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - - vpaddd %ymm0, %ymm3, %ymm3 - vpaddd %ymm4, %ymm7, %ymm7 - vmovdqa %ymm3, \i*32(%rax) - vmovdqa %ymm7, (\i+1)*32(%rax) -.endm - -.macro sha256_avx2_main_round i, r0, r1, r2, r3, r4, r5, r6, r7 - vpaddd 32*(\i)(%rax), \r0, %ymm6 - vpaddd 32*(\i)(%rcx), %ymm6, %ymm6 - - vpandn \r1, \r3, %ymm1 - vpand \r3, \r2, %ymm2 - vpxor %ymm2, %ymm1, %ymm1 - vpaddd %ymm1, %ymm6, %ymm6 - - vpslld $7, \r3, %ymm1 - vpsrld $6, \r3, \r0 - vpsrld $5, \r0, %ymm2 - vpxor %ymm1, \r0, \r0 - vpxor %ymm2, \r0, \r0 - vpslld $14, %ymm1, %ymm1 - vpsrld $14, %ymm2, %ymm2 - vpxor %ymm1, \r0, \r0 - vpxor %ymm2, \r0, \r0 - vpslld $5, %ymm1, %ymm1 - vpxor %ymm1, \r0, \r0 - vpaddd \r0, %ymm6, %ymm6 - vpaddd %ymm6, \r4, \r0 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 - vpand \r6, \r5, %ymm2 - vpand \r7, \r5, \r4 - vpand \r7, \r6, %ymm1 - vpxor \r4, %ymm1, %ymm1 - vpxor %ymm2, %ymm1, %ymm1 - vpaddd %ymm1, %ymm6, %ymm6 + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (4-16)*16(%rax), %xmm8, %xmm0 - vpslld $10, \r7, %ymm2 - vpsrld $2, \r7, \r4 - vpsrld $11, \r4, %ymm1 - vpxor %ymm2, \r4, \r4 - vpxor %ymm1, \r4, \r4 - vpslld $9, %ymm2, %ymm2 - vpsrld $9, %ymm1, %ymm1 - vpxor %ymm2, \r4, \r4 - vpxor %ymm1, \r4, \r4 - vpslld $11, %ymm2, %ymm2 - vpxor %ymm2, \r4, \r4 - vpaddd %ymm6, \r4, \r4 -.endm + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 -.macro sha256_avx2_main_quadround i - sha256_avx2_main_round \i+0, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7 - sha256_avx2_main_round \i+1, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3 - sha256_avx2_main_round \i+2, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4 - sha256_avx2_main_round \i+3, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5 -.endm + vpaddd (4-7)*16(%rax), %xmm0, %xmm0 + vpaddd (4-6)*16(%rax), %xmm4, %xmm4 -#endif /* USE_AVX2 */ + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 4*16(%rax) + vmovdqa %xmm7, (4+1)*16(%rax) + vmovdqa (6-15)*16(%rax), %xmm0 + vmovdqa (6-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 -#if defined(USE_XOP) + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (6-16)*16(%rax), %xmm8, %xmm0 -.macro sha256_xop_extend_round i - vmovdqa (\i-15)*16(%rax), %xmm0 - vprotd $25, %xmm0, %xmm1 - vprotd 
$14, %xmm0, %xmm2 - vpsrld $3, %xmm0, %xmm0 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm2, %xmm0, %xmm0 + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 - vpaddd (\i-16)*16(%rax), %xmm0, %xmm0 - vpaddd (\i-7)*16(%rax), %xmm0, %xmm0 + vpaddd (6-7)*16(%rax), %xmm0, %xmm0 + vpaddd (6-6)*16(%rax), %xmm4, %xmm4 - vprotd $15, %xmm3, %xmm1 - vprotd $13, %xmm3, %xmm2 - vpsrld $10, %xmm3, %xmm3 - vpxor %xmm1, %xmm2, %xmm2 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 vpxor %xmm2, %xmm3, %xmm3 - vpaddd %xmm0, %xmm3, %xmm3 - vmovdqa %xmm3, \i*16(%rax) -.endm + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 -.macro sha256_xop_extend_doubleround i - vmovdqa (\i-15)*16(%rax), %xmm0 - vmovdqa (\i-14)*16(%rax), %xmm4 - vprotd $25, %xmm0, %xmm1 - vprotd $25, %xmm4, %xmm5 - vprotd $14, %xmm0, %xmm2 - vprotd $14, %xmm4, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - vpsrld $3, %xmm0, %xmm0 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 6*16(%rax) + vmovdqa %xmm7, (6+1)*16(%rax) + vmovdqa (8-15)*16(%rax), %xmm0 + vmovdqa (8-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 vpsrld $3, %xmm4, %xmm4 - vpxor %xmm2, %xmm0, %xmm0 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 vpxor %xmm6, %xmm4, %xmm4 - vpaddd (\i-16)*16(%rax), %xmm0, %xmm0 - vpaddd (\i-15)*16(%rax), %xmm4, %xmm4 - - vprotd $15, %xmm3, %xmm1 - vprotd $15, %xmm7, %xmm5 - vprotd $13, %xmm3, %xmm2 - vprotd $13, %xmm7, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - - vpaddd (\i-7)*16(%rax), %xmm0, %xmm0 - vpaddd (\i-6)*16(%rax), %xmm4, %xmm4 + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (8-16)*16(%rax), %xmm8, %xmm0 + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 vpsrld $10, %xmm3, %xmm3 vpsrld $10, %xmm7, %xmm7 + + vpaddd (8-7)*16(%rax), %xmm0, %xmm0 + vpaddd (8-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 vpxor %xmm2, %xmm3, %xmm3 vpxor %xmm6, %xmm7, %xmm7 vpaddd %xmm0, %xmm3, %xmm3 vpaddd %xmm4, %xmm7, %xmm7 - vmovdqa %xmm3, \i*16(%rax) - vmovdqa %xmm7, (\i+1)*16(%rax) -.endm - -.macro sha256_xop_main_round i, r0, r1, r2, r3, r4, r5, r6, r7 - vpaddd 16*(\i)(%rax), \r0, %xmm6 - vpaddd 16*(\i)(%rcx), %xmm6, %xmm6 - - vpandn \r1, \r3, %xmm1 - vpand \r3, \r2, %xmm2 - vpxor %xmm2, %xmm1, %xmm1 - vpaddd %xmm1, %xmm6, %xmm6 + vmovdqa %xmm3, 8*16(%rax) + vmovdqa %xmm7, (8+1)*16(%rax) + vmovdqa (10-15)*16(%rax), %xmm0 + vmovdqa (10-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + 
vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 - vprotd $26, \r3, %xmm1 - vprotd $21, \r3, %xmm2 - vpxor %xmm1, %xmm2, %xmm2 - vprotd $7, \r3, \r0 - vpxor %xmm2, \r0, \r0 - vpaddd \r0, %xmm6, %xmm6 - vpaddd %xmm6, \r4, \r0 + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (10-16)*16(%rax), %xmm8, %xmm0 - vpand \r6, \r5, %xmm2 - vpand \r7, \r5, \r4 - vpand \r7, \r6, %xmm1 - vpxor \r4, %xmm1, %xmm1 - vpxor %xmm2, %xmm1, %xmm1 - vpaddd %xmm1, %xmm6, %xmm6 - - vprotd $30, \r7, %xmm1 - vprotd $19, \r7, %xmm2 - vpxor %xmm1, %xmm2, %xmm2 - vprotd $10, \r7, \r4 - vpxor %xmm2, \r4, \r4 - vpaddd %xmm6, \r4, \r4 -.endm + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 -.macro sha256_xop_main_quadround i - sha256_xop_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 - sha256_xop_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 - sha256_xop_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 - sha256_xop_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 -.endm + vpaddd (10-7)*16(%rax), %xmm0, %xmm0 + vpaddd (10-6)*16(%rax), %xmm4, %xmm4 -#endif /* USE_XOP */ + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 10*16(%rax) + vmovdqa %xmm7, (10+1)*16(%rax) + vmovdqa (12-15)*16(%rax), %xmm0 + vmovdqa (12-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 - .text - .p2align 6 -sha256_transform_4way_core_sse2: - leaq 256(%rsp), %rcx - leaq 48*16(%rcx), %rax - movdqa -2*16(%rcx), %xmm3 - movdqa -1*16(%rcx), %xmm7 -sha256_transform_4way_sse2_extend_loop: - movdqa -15*16(%rcx), %xmm0 - movdqa -14*16(%rcx), %xmm4 - movdqa %xmm0, %xmm2 - movdqa %xmm4, %xmm6 - psrld $3, %xmm0 - psrld $3, %xmm4 - movdqa %xmm0, %xmm1 - movdqa %xmm4, %xmm5 - pslld $14, %xmm2 - pslld $14, %xmm6 - psrld $4, %xmm1 - psrld $4, %xmm5 - pxor %xmm1, %xmm0 - pxor %xmm5, %xmm4 - psrld $11, %xmm1 - psrld $11, %xmm5 - pxor %xmm2, %xmm0 - pxor %xmm6, %xmm4 - pslld $11, %xmm2 - pslld $11, %xmm6 - pxor %xmm1, %xmm0 - pxor %xmm5, %xmm4 - pxor %xmm2, %xmm0 - pxor %xmm6, %xmm4 + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (12-16)*16(%rax), %xmm8, %xmm0 - paddd -16*16(%rcx), %xmm0 - paddd -15*16(%rcx), %xmm4 + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa 
%xmm7, %xmm5 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 + vpaddd (12-7)*16(%rax), %xmm0, %xmm0 + vpaddd (12-6)*16(%rax), %xmm4, %xmm4 - paddd -7*16(%rcx), %xmm0 - paddd -6*16(%rcx), %xmm4 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 12*16(%rax) + vmovdqa %xmm7, (12+1)*16(%rax) + vmovdqa (14-15)*16(%rax), %xmm0 + vmovdqa (14-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 - paddd %xmm0, %xmm3 - paddd %xmm4, %xmm7 - movdqa %xmm3, (%rcx) - movdqa %xmm7, 16(%rcx) - addq $2*16, %rcx - cmpq %rcx, %rax - jne sha256_transform_4way_sse2_extend_loop - - movdqu 0(%rdi), %xmm7 - movdqu 16(%rdi), %xmm5 - movdqu 32(%rdi), %xmm4 - movdqu 48(%rdi), %xmm3 - movdqu 64(%rdi), %xmm0 - movdqu 80(%rdi), %xmm8 - movdqu 96(%rdi), %xmm9 - movdqu 112(%rdi), %xmm10 - - leaq sha256_4k(%rip), %rcx - xorq %rax, %rax -sha256_transform_4way_sse2_main_loop: - movdqa (%rsp, %rax), %xmm6 - paddd (%rcx, %rax), %xmm6 - paddd %xmm10, %xmm6 + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (14-16)*16(%rax), %xmm8, %xmm0 - movdqa %xmm0, %xmm1 - movdqa %xmm9, %xmm2 - pandn %xmm2, %xmm1 + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 - movdqa %xmm2, %xmm10 - movdqa %xmm8, %xmm2 - movdqa %xmm2, %xmm9 + vpaddd (14-7)*16(%rax), %xmm0, %xmm0 + vpaddd (14-6)*16(%rax), %xmm4, %xmm4 - pand %xmm0, %xmm2 - pxor %xmm2, %xmm1 - movdqa %xmm0, %xmm8 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 - paddd %xmm1, %xmm6 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 14*16(%rax) + vmovdqa %xmm7, (14+1)*16(%rax) + vmovdqa (16-15)*16(%rax), %xmm0 + vmovdqa (16-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + 
vpxor %xmm6, %xmm4, %xmm4 - movdqa %xmm0, %xmm1 - psrld $6, %xmm0 - movdqa %xmm0, %xmm2 - pslld $7, %xmm1 - psrld $5, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - pslld $14, %xmm1 - psrld $14, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - pslld $5, %xmm1 - pxor %xmm1, %xmm0 - paddd %xmm0, %xmm6 + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (16-16)*16(%rax), %xmm8, %xmm0 - movdqa %xmm3, %xmm0 - paddd %xmm6, %xmm0 + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 - movdqa %xmm5, %xmm1 - movdqa %xmm4, %xmm3 - movdqa %xmm4, %xmm2 - pand %xmm5, %xmm2 - pand %xmm7, %xmm4 - pand %xmm7, %xmm1 - pxor %xmm4, %xmm1 - movdqa %xmm5, %xmm4 - movdqa %xmm7, %xmm5 - pxor %xmm2, %xmm1 - paddd %xmm1, %xmm6 + vpaddd (16-7)*16(%rax), %xmm0, %xmm0 + vpaddd (16-6)*16(%rax), %xmm4, %xmm4 - movdqa %xmm7, %xmm2 - psrld $2, %xmm7 - movdqa %xmm7, %xmm1 - pslld $10, %xmm2 - psrld $11, %xmm1 - pxor %xmm2, %xmm7 - pxor %xmm1, %xmm7 - pslld $9, %xmm2 - psrld $9, %xmm1 - pxor %xmm2, %xmm7 - pxor %xmm1, %xmm7 - pslld $11, %xmm2 - pxor %xmm2, %xmm7 - paddd %xmm6, %xmm7 - - addq $16, %rax - cmpq $16*64, %rax - jne sha256_transform_4way_sse2_main_loop - jmp sha256_transform_4way_finish + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 16*16(%rax) + vmovdqa %xmm7, (16+1)*16(%rax) + vmovdqa (18-15)*16(%rax), %xmm0 + vmovdqa (18-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 -#if defined(USE_AVX) - .text - .p2align 6 -sha256_transform_4way_core_avx: - leaq 256(%rsp), %rax - movdqa -2*16(%rax), %xmm3 - movdqa -1*16(%rax), %xmm7 - sha256_avx_extend_doubleround 0 - sha256_avx_extend_doubleround 2 - sha256_avx_extend_doubleround 4 - sha256_avx_extend_doubleround 6 - sha256_avx_extend_doubleround 8 - sha256_avx_extend_doubleround 10 - sha256_avx_extend_doubleround 12 - sha256_avx_extend_doubleround 14 - sha256_avx_extend_doubleround 16 - sha256_avx_extend_doubleround 18 - sha256_avx_extend_doubleround 20 - sha256_avx_extend_doubleround 22 - sha256_avx_extend_doubleround 24 - sha256_avx_extend_doubleround 26 - sha256_avx_extend_doubleround 28 - sha256_avx_extend_doubleround 30 - sha256_avx_extend_doubleround 32 - sha256_avx_extend_doubleround 34 - sha256_avx_extend_doubleround 36 - sha256_avx_extend_doubleround 38 - sha256_avx_extend_doubleround 40 - sha256_avx_extend_doubleround 42 - sha256_avx_extend_doubleround 44 - sha256_avx_extend_doubleround 46 - movdqu 0(%rdi), %xmm7 - movdqu 16(%rdi), %xmm5 - movdqu 32(%rdi), %xmm4 - movdqu 48(%rdi), %xmm3 - movdqu 64(%rdi), %xmm0 - movdqu 80(%rdi), %xmm8 - movdqu 96(%rdi), %xmm9 - movdqu 112(%rdi), %xmm10 - movq %rsp, %rax - leaq sha256_4k(%rip), %rcx - sha256_avx_main_quadround 0 - 
sha256_avx_main_quadround 4 - sha256_avx_main_quadround 8 - sha256_avx_main_quadround 12 - sha256_avx_main_quadround 16 - sha256_avx_main_quadround 20 - sha256_avx_main_quadround 24 - sha256_avx_main_quadround 28 - sha256_avx_main_quadround 32 - sha256_avx_main_quadround 36 - sha256_avx_main_quadround 40 - sha256_avx_main_quadround 44 - sha256_avx_main_quadround 48 - sha256_avx_main_quadround 52 - sha256_avx_main_quadround 56 - sha256_avx_main_quadround 60 - jmp sha256_transform_4way_finish -#endif /* USE_AVX */ + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (18-16)*16(%rax), %xmm8, %xmm0 + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 -#if defined(USE_XOP) - .text - .p2align 6 -sha256_transform_4way_core_xop: - leaq 256(%rsp), %rax - movdqa -2*16(%rax), %xmm3 - movdqa -1*16(%rax), %xmm7 - sha256_xop_extend_doubleround 0 - sha256_xop_extend_doubleround 2 - sha256_xop_extend_doubleround 4 - sha256_xop_extend_doubleround 6 - sha256_xop_extend_doubleround 8 - sha256_xop_extend_doubleround 10 - sha256_xop_extend_doubleround 12 - sha256_xop_extend_doubleround 14 - sha256_xop_extend_doubleround 16 - sha256_xop_extend_doubleround 18 - sha256_xop_extend_doubleround 20 - sha256_xop_extend_doubleround 22 - sha256_xop_extend_doubleround 24 - sha256_xop_extend_doubleround 26 - sha256_xop_extend_doubleround 28 - sha256_xop_extend_doubleround 30 - sha256_xop_extend_doubleround 32 - sha256_xop_extend_doubleround 34 - sha256_xop_extend_doubleround 36 - sha256_xop_extend_doubleround 38 - sha256_xop_extend_doubleround 40 - sha256_xop_extend_doubleround 42 - sha256_xop_extend_doubleround 44 - sha256_xop_extend_doubleround 46 - movdqu 0(%rdi), %xmm7 - movdqu 16(%rdi), %xmm5 - movdqu 32(%rdi), %xmm4 - movdqu 48(%rdi), %xmm3 - movdqu 64(%rdi), %xmm0 - movdqu 80(%rdi), %xmm8 - movdqu 96(%rdi), %xmm9 - movdqu 112(%rdi), %xmm10 - movq %rsp, %rax - leaq sha256_4k(%rip), %rcx - sha256_xop_main_quadround 0 - sha256_xop_main_quadround 4 - sha256_xop_main_quadround 8 - sha256_xop_main_quadround 12 - sha256_xop_main_quadround 16 - sha256_xop_main_quadround 20 - sha256_xop_main_quadround 24 - sha256_xop_main_quadround 28 - sha256_xop_main_quadround 32 - sha256_xop_main_quadround 36 - sha256_xop_main_quadround 40 - sha256_xop_main_quadround 44 - sha256_xop_main_quadround 48 - sha256_xop_main_quadround 52 - sha256_xop_main_quadround 56 - sha256_xop_main_quadround 60 - jmp sha256_transform_4way_finish -#endif /* USE_XOP */ + vpaddd (18-7)*16(%rax), %xmm0, %xmm0 + vpaddd (18-6)*16(%rax), %xmm4, %xmm4 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 - .data - .p2align 3 -sha256_transform_4way_core_addr: - .quad 0x0 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 18*16(%rax) + vmovdqa %xmm7, (18+1)*16(%rax) + vmovdqa (20-15)*16(%rax), %xmm0 + vmovdqa (20-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + 
vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 -.macro p2bswap_rsi_rsp i - movdqu \i*16(%rsi), %xmm0 - movdqu (\i+1)*16(%rsi), %xmm2 - pshuflw $0xb1, %xmm0, %xmm0 - pshuflw $0xb1, %xmm2, %xmm2 - pshufhw $0xb1, %xmm0, %xmm0 - pshufhw $0xb1, %xmm2, %xmm2 - movdqa %xmm0, %xmm1 - movdqa %xmm2, %xmm3 - psrlw $8, %xmm1 - psrlw $8, %xmm3 - psllw $8, %xmm0 - psllw $8, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm3, %xmm2 - movdqa %xmm0, \i*16(%rsp) - movdqa %xmm2, (\i+1)*16(%rsp) -.endm - - .text - .p2align 6 - .globl sha256_transform_4way - .globl _sha256_transform_4way -sha256_transform_4way: -_sha256_transform_4way: -#if defined(_WIN64) || defined(__CYGWIN__) - pushq %rdi - subq $96, %rsp - movdqa %xmm6, 0(%rsp) - movdqa %xmm7, 16(%rsp) - movdqa %xmm8, 32(%rsp) - movdqa %xmm9, 48(%rsp) - movdqa %xmm10, 64(%rsp) - movdqa %xmm11, 80(%rsp) - pushq %rsi - movq %rcx, %rdi - movq %rdx, %rsi - movq %r8, %rdx -#endif - movq %rsp, %r8 - subq $1032, %rsp - andq $-128, %rsp - - testq %rdx, %rdx - jnz sha256_transform_4way_swap - - movdqu 0*16(%rsi), %xmm0 - movdqu 1*16(%rsi), %xmm1 - movdqu 2*16(%rsi), %xmm2 - movdqu 3*16(%rsi), %xmm3 - movdqu 4*16(%rsi), %xmm4 - movdqu 5*16(%rsi), %xmm5 - movdqu 6*16(%rsi), %xmm6 - movdqu 7*16(%rsi), %xmm7 - movdqa %xmm0, 0*16(%rsp) - movdqa %xmm1, 1*16(%rsp) - movdqa %xmm2, 2*16(%rsp) - movdqa %xmm3, 3*16(%rsp) - movdqa %xmm4, 4*16(%rsp) - movdqa %xmm5, 5*16(%rsp) - movdqa %xmm6, 6*16(%rsp) - movdqa %xmm7, 7*16(%rsp) - movdqu 8*16(%rsi), %xmm0 - movdqu 9*16(%rsi), %xmm1 - movdqu 10*16(%rsi), %xmm2 - movdqu 11*16(%rsi), %xmm3 - movdqu 12*16(%rsi), %xmm4 - movdqu 13*16(%rsi), %xmm5 - movdqu 14*16(%rsi), %xmm6 - movdqu 15*16(%rsi), %xmm7 - movdqa %xmm0, 8*16(%rsp) - movdqa %xmm1, 9*16(%rsp) - movdqa %xmm2, 10*16(%rsp) - movdqa %xmm3, 11*16(%rsp) - movdqa %xmm4, 12*16(%rsp) - movdqa %xmm5, 13*16(%rsp) - movdqa %xmm6, 14*16(%rsp) - movdqa %xmm7, 15*16(%rsp) - jmp *sha256_transform_4way_core_addr(%rip) - - .p2align 6 -sha256_transform_4way_swap: - p2bswap_rsi_rsp 0 - p2bswap_rsi_rsp 2 - p2bswap_rsi_rsp 4 - p2bswap_rsi_rsp 6 - p2bswap_rsi_rsp 8 - p2bswap_rsi_rsp 10 - p2bswap_rsi_rsp 12 - p2bswap_rsi_rsp 14 - jmp *sha256_transform_4way_core_addr(%rip) - - .p2align 6 -sha256_transform_4way_finish: - movdqu 0(%rdi), %xmm2 - movdqu 16(%rdi), %xmm6 - movdqu 32(%rdi), %xmm11 - movdqu 48(%rdi), %xmm1 - paddd %xmm2, %xmm7 - paddd %xmm6, %xmm5 - paddd %xmm11, %xmm4 - paddd %xmm1, %xmm3 - movdqu 64(%rdi), %xmm2 - movdqu 80(%rdi), %xmm6 - movdqu 96(%rdi), %xmm11 - movdqu 112(%rdi), %xmm1 - paddd %xmm2, %xmm0 - paddd %xmm6, %xmm8 - paddd %xmm11, %xmm9 - paddd %xmm1, %xmm10 - - movdqu %xmm7, 0(%rdi) - movdqu %xmm5, 16(%rdi) - movdqu %xmm4, 32(%rdi) - movdqu %xmm3, 48(%rdi) - movdqu %xmm0, 64(%rdi) - movdqu %xmm8, 80(%rdi) - movdqu %xmm9, 96(%rdi) - movdqu %xmm10, 112(%rdi) - - movq %r8, %rsp -#if defined(_WIN64) || defined(__CYGWIN__) - popq %rsi - movdqa 0(%rsp), %xmm6 - movdqa 16(%rsp), %xmm7 - movdqa 32(%rsp), %xmm8 - movdqa 48(%rsp), %xmm9 - movdqa 64(%rsp), %xmm10 - movdqa 80(%rsp), %xmm11 - addq $96, %rsp - popq %rdi -#endif - ret + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (20-16)*16(%rax), %xmm8, %xmm0 + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 -#ifdef USE_AVX2 + vpaddd (20-7)*16(%rax), %xmm0, %xmm0 + vpaddd (20-6)*16(%rax), %xmm4, %xmm4 - .text - .p2align 6 -sha256_transform_8way_core_avx2: - leaq 8*64(%rsp), %rax - vmovdqa -2*32(%rax), %ymm3 
- vmovdqa -1*32(%rax), %ymm7 - sha256_avx2_extend_doubleround 0 - sha256_avx2_extend_doubleround 2 - sha256_avx2_extend_doubleround 4 - sha256_avx2_extend_doubleround 6 - sha256_avx2_extend_doubleround 8 - sha256_avx2_extend_doubleround 10 - sha256_avx2_extend_doubleround 12 - sha256_avx2_extend_doubleround 14 - sha256_avx2_extend_doubleround 16 - sha256_avx2_extend_doubleround 18 - sha256_avx2_extend_doubleround 20 - sha256_avx2_extend_doubleround 22 - sha256_avx2_extend_doubleround 24 - sha256_avx2_extend_doubleround 26 - sha256_avx2_extend_doubleround 28 - sha256_avx2_extend_doubleround 30 - sha256_avx2_extend_doubleround 32 - sha256_avx2_extend_doubleround 34 - sha256_avx2_extend_doubleround 36 - sha256_avx2_extend_doubleround 38 - sha256_avx2_extend_doubleround 40 - sha256_avx2_extend_doubleround 42 - sha256_avx2_extend_doubleround 44 - sha256_avx2_extend_doubleround 46 - vmovdqu 0*32(%rdi), %ymm7 - vmovdqu 1*32(%rdi), %ymm5 - vmovdqu 2*32(%rdi), %ymm4 - vmovdqu 3*32(%rdi), %ymm3 - vmovdqu 4*32(%rdi), %ymm0 - vmovdqu 5*32(%rdi), %ymm8 - vmovdqu 6*32(%rdi), %ymm9 - vmovdqu 7*32(%rdi), %ymm10 - movq %rsp, %rax - leaq sha256_8k(%rip), %rcx - sha256_avx2_main_quadround 0 - sha256_avx2_main_quadround 4 - sha256_avx2_main_quadround 8 - sha256_avx2_main_quadround 12 - sha256_avx2_main_quadround 16 - sha256_avx2_main_quadround 20 - sha256_avx2_main_quadround 24 - sha256_avx2_main_quadround 28 - sha256_avx2_main_quadround 32 - sha256_avx2_main_quadround 36 - sha256_avx2_main_quadround 40 - sha256_avx2_main_quadround 44 - sha256_avx2_main_quadround 48 - sha256_avx2_main_quadround 52 - sha256_avx2_main_quadround 56 - sha256_avx2_main_quadround 60 - jmp sha256_transform_8way_finish + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 -.macro p2bswap_avx2_rsi_rsp i - vmovdqu \i*32(%rsi), %ymm0 - vmovdqu (\i+1)*32(%rsi), %ymm2 - vpshuflw $0xb1, %ymm0, %ymm0 - vpshuflw $0xb1, %ymm2, %ymm2 - vpshufhw $0xb1, %ymm0, %ymm0 - vpshufhw $0xb1, %ymm2, %ymm2 - vpsrlw $8, %ymm0, %ymm1 - vpsrlw $8, %ymm2, %ymm3 - vpsllw $8, %ymm0, %ymm0 - vpsllw $8, %ymm2, %ymm2 - vpxor %ymm1, %ymm0, %ymm0 - vpxor %ymm3, %ymm2, %ymm2 - vmovdqa %ymm0, \i*32(%rsp) - vmovdqa %ymm2, (\i+1)*32(%rsp) -.endm - - .text - .p2align 6 - .globl sha256_transform_8way - .globl _sha256_transform_8way -sha256_transform_8way: -_sha256_transform_8way: -#if defined(_WIN64) || defined(__CYGWIN__) - pushq %rdi - subq $96, %rsp - vmovdqa %xmm6, 0(%rsp) - vmovdqa %xmm7, 16(%rsp) - vmovdqa %xmm8, 32(%rsp) - vmovdqa %xmm9, 48(%rsp) - vmovdqa %xmm10, 64(%rsp) - vmovdqa %xmm11, 80(%rsp) - pushq %rsi - movq %rcx, %rdi - movq %rdx, %rsi - movq %r8, %rdx -#endif - movq %rsp, %r8 - subq $64*32, %rsp - andq $-128, %rsp - - testq %rdx, %rdx - jnz sha256_transform_8way_swap - - vmovdqu 0*32(%rsi), %ymm0 - vmovdqu 1*32(%rsi), %ymm1 - vmovdqu 2*32(%rsi), %ymm2 - vmovdqu 3*32(%rsi), %ymm3 - vmovdqu 4*32(%rsi), %ymm4 - vmovdqu 5*32(%rsi), %ymm5 - vmovdqu 6*32(%rsi), %ymm6 - vmovdqu 7*32(%rsi), %ymm7 - vmovdqa %ymm0, 0*32(%rsp) - vmovdqa %ymm1, 1*32(%rsp) - vmovdqa %ymm2, 2*32(%rsp) - vmovdqa %ymm3, 3*32(%rsp) - vmovdqa %ymm4, 4*32(%rsp) - vmovdqa %ymm5, 5*32(%rsp) - vmovdqa %ymm6, 6*32(%rsp) - vmovdqa %ymm7, 7*32(%rsp) - vmovdqu 8*32(%rsi), 
%ymm0 - vmovdqu 9*32(%rsi), %ymm1 - vmovdqu 10*32(%rsi), %ymm2 - vmovdqu 11*32(%rsi), %ymm3 - vmovdqu 12*32(%rsi), %ymm4 - vmovdqu 13*32(%rsi), %ymm5 - vmovdqu 14*32(%rsi), %ymm6 - vmovdqu 15*32(%rsi), %ymm7 - vmovdqa %ymm0, 8*32(%rsp) - vmovdqa %ymm1, 9*32(%rsp) - vmovdqa %ymm2, 10*32(%rsp) - vmovdqa %ymm3, 11*32(%rsp) - vmovdqa %ymm4, 12*32(%rsp) - vmovdqa %ymm5, 13*32(%rsp) - vmovdqa %ymm6, 14*32(%rsp) - vmovdqa %ymm7, 15*32(%rsp) - jmp sha256_transform_8way_core_avx2 - - .p2align 6 -sha256_transform_8way_swap: - p2bswap_avx2_rsi_rsp 0 - p2bswap_avx2_rsi_rsp 2 - p2bswap_avx2_rsi_rsp 4 - p2bswap_avx2_rsi_rsp 6 - p2bswap_avx2_rsi_rsp 8 - p2bswap_avx2_rsi_rsp 10 - p2bswap_avx2_rsi_rsp 12 - p2bswap_avx2_rsi_rsp 14 - jmp sha256_transform_8way_core_avx2 - - .p2align 6 -sha256_transform_8way_finish: - vmovdqu 0*32(%rdi), %ymm2 - vmovdqu 1*32(%rdi), %ymm6 - vmovdqu 2*32(%rdi), %ymm11 - vmovdqu 3*32(%rdi), %ymm1 - vpaddd %ymm2, %ymm7, %ymm7 - vpaddd %ymm6, %ymm5, %ymm5 - vpaddd %ymm11, %ymm4, %ymm4 - vpaddd %ymm1, %ymm3, %ymm3 - vmovdqu 4*32(%rdi), %ymm2 - vmovdqu 5*32(%rdi), %ymm6 - vmovdqu 6*32(%rdi), %ymm11 - vmovdqu 7*32(%rdi), %ymm1 - vpaddd %ymm2, %ymm0, %ymm0 - vpaddd %ymm6, %ymm8, %ymm8 - vpaddd %ymm11, %ymm9, %ymm9 - vpaddd %ymm1, %ymm10, %ymm10 - - vmovdqu %ymm7, 0*32(%rdi) - vmovdqu %ymm5, 1*32(%rdi) - vmovdqu %ymm4, 2*32(%rdi) - vmovdqu %ymm3, 3*32(%rdi) - vmovdqu %ymm0, 4*32(%rdi) - vmovdqu %ymm8, 5*32(%rdi) - vmovdqu %ymm9, 6*32(%rdi) - vmovdqu %ymm10, 7*32(%rdi) - - movq %r8, %rsp -#if defined(_WIN64) || defined(__CYGWIN__) - popq %rsi - vmovdqa 0(%rsp), %xmm6 - vmovdqa 16(%rsp), %xmm7 - vmovdqa 32(%rsp), %xmm8 - vmovdqa 48(%rsp), %xmm9 - vmovdqa 64(%rsp), %xmm10 - vmovdqa 80(%rsp), %xmm11 - addq $96, %rsp - popq %rdi -#endif - ret + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 20*16(%rax) + vmovdqa %xmm7, (20+1)*16(%rax) + vmovdqa (22-15)*16(%rax), %xmm0 + vmovdqa (22-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 -#endif /* USE_AVX2 */ - - - .data - .p2align 3 -sha256d_ms_4way_addr: - .quad 0x0 - - .text - .p2align 6 - .globl sha256d_ms_4way - .globl _sha256d_ms_4way -sha256d_ms_4way: -_sha256d_ms_4way: - jmp *sha256d_ms_4way_addr(%rip) - - - .p2align 6 -sha256d_ms_4way_sse2: -#if defined(_WIN64) || defined(__CYGWIN__) - pushq %rdi - subq $32, %rsp - movdqa %xmm6, 0(%rsp) - movdqa %xmm7, 16(%rsp) - pushq %rsi - movq %rcx, %rdi - movq %rdx, %rsi - movq %r8, %rdx - movq %r9, %rcx -#endif - subq $8+67*16, %rsp - - leaq 256(%rsi), %rax - -sha256d_ms_4way_sse2_extend_loop1: - movdqa 3*16(%rsi), %xmm0 - movdqa 2*16(%rax), %xmm3 - movdqa 3*16(%rax), %xmm7 - movdqa %xmm3, 5*16(%rsp) - movdqa %xmm7, 6*16(%rsp) - movdqa %xmm0, %xmm2 - paddd %xmm0, %xmm7 - psrld $3, %xmm0 - movdqa %xmm0, %xmm1 - pslld $14, %xmm2 - psrld $4, %xmm1 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - psrld $11, %xmm1 - pslld $11, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - paddd %xmm0, %xmm3 - movdqa %xmm3, 2*16(%rax) - movdqa %xmm7, 3*16(%rax) - - movdqa 4*16(%rax), %xmm0 - movdqa %xmm0, 7*16(%rsp) - movdqa %xmm3, %xmm2 - movdqa 
%xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd %xmm0, %xmm3 - movdqa %xmm3, 4*16(%rax) - movdqa %xmm7, 5*16(%rax) - - movdqa 6*16(%rax), %xmm0 - movdqa 7*16(%rax), %xmm4 - movdqa %xmm0, 9*16(%rsp) - movdqa %xmm4, 10*16(%rsp) - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (22-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (22-7)*16(%rax), %xmm0, %xmm0 + vpaddd (22-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 22*16(%rax) + vmovdqa %xmm7, (22+1)*16(%rax) + vmovdqa (24-15)*16(%rax), %xmm0 + vmovdqa (24-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (24-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (24-7)*16(%rax), %xmm0, %xmm0 + vpaddd (24-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 24*16(%rax) + vmovdqa %xmm7, (24+1)*16(%rax) + vmovdqa (26-15)*16(%rax), %xmm0 + vmovdqa (26-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (26-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd 
(26-7)*16(%rax), %xmm0, %xmm0 + vpaddd (26-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 26*16(%rax) + vmovdqa %xmm7, (26+1)*16(%rax) + vmovdqa (28-15)*16(%rax), %xmm0 + vmovdqa (28-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (28-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (28-7)*16(%rax), %xmm0, %xmm0 + vpaddd (28-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 28*16(%rax) + vmovdqa %xmm7, (28+1)*16(%rax) + vmovdqa (30-15)*16(%rax), %xmm0 + vmovdqa (30-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (30-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (30-7)*16(%rax), %xmm0, %xmm0 + vpaddd (30-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 30*16(%rax) + vmovdqa %xmm7, (30+1)*16(%rax) + vmovdqa (32-15)*16(%rax), %xmm0 + vmovdqa (32-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 
+ vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (32-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (32-7)*16(%rax), %xmm0, %xmm0 + vpaddd (32-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 32*16(%rax) + vmovdqa %xmm7, (32+1)*16(%rax) + vmovdqa (34-15)*16(%rax), %xmm0 + vmovdqa (34-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (34-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (34-7)*16(%rax), %xmm0, %xmm0 + vpaddd (34-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 34*16(%rax) + vmovdqa %xmm7, (34+1)*16(%rax) + vmovdqa (36-15)*16(%rax), %xmm0 + vmovdqa (36-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (36-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (36-7)*16(%rax), %xmm0, %xmm0 + vpaddd (36-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd 
%xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 36*16(%rax) + vmovdqa %xmm7, (36+1)*16(%rax) + vmovdqa (38-15)*16(%rax), %xmm0 + vmovdqa (38-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (38-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (38-7)*16(%rax), %xmm0, %xmm0 + vpaddd (38-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 38*16(%rax) + vmovdqa %xmm7, (38+1)*16(%rax) + vmovdqa (40-15)*16(%rax), %xmm0 + vmovdqa (40-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (40-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (40-7)*16(%rax), %xmm0, %xmm0 + vpaddd (40-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 40*16(%rax) + vmovdqa %xmm7, (40+1)*16(%rax) + vmovdqa (42-15)*16(%rax), %xmm0 + vmovdqa (42-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (42-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (42-7)*16(%rax), %xmm0, %xmm0 + vpaddd (42-6)*16(%rax), %xmm4, %xmm4 + + 
vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 42*16(%rax) + vmovdqa %xmm7, (42+1)*16(%rax) + vmovdqa (44-15)*16(%rax), %xmm0 + vmovdqa (44-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (44-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (44-7)*16(%rax), %xmm0, %xmm0 + vpaddd (44-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 44*16(%rax) + vmovdqa %xmm7, (44+1)*16(%rax) + vmovdqa (46-15)*16(%rax), %xmm0 + vmovdqa (46-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (46-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (46-7)*16(%rax), %xmm0, %xmm0 + vpaddd (46-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 46*16(%rax) + vmovdqa %xmm7, (46+1)*16(%rax) + movdqu 0(%rdi), %xmm7 + movdqu 16(%rdi), %xmm5 + movdqu 32(%rdi), %xmm4 + movdqu 48(%rdi), %xmm3 + movdqu 64(%rdi), %xmm0 + movdqu 80(%rdi), %xmm8 + movdqu 96(%rdi), %xmm9 + movdqu 112(%rdi), %xmm10 + movq %rsp, %rax + leaq sha256_4k(%rip), %rcx + vpaddd 16*(0+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(0+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, 
%xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(0+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(0+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(0+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(0+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(0+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(0+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, 
%xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(4+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(4+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(4+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(4+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(4+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(4+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor 
%xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(4+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(4+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(8+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(8+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(8+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(8+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(8+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(8+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + 
vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(8+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(8+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(12+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(12+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(12+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(12+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, 
%xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(12+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(12+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(12+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(12+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(16+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(16+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor 
%xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(16+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(16+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(16+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(16+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(16+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(16+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(20+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(20+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + 
vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(20+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(20+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(20+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(20+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(20+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(20+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, 
%xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(24+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(24+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(24+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(24+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(24+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(24+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(24+3)(%rax), %xmm0, 
%xmm6 + vpaddd 16*(24+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(28+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(28+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(28+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(28+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(28+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(28+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, 
%xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(28+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(28+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(32+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(32+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(32+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(32+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + 
vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(32+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(32+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(32+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(32+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(36+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(36+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(36+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(36+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, 
%xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(36+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(36+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(36+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(36+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(40+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(40+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, 
%xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(40+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(40+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(40+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(40+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(40+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(40+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + 
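+# Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c) is the three-vpand / two-vpxor
+# group, and Sigma0(a) = rotr2 ^ rotr13 ^ rotr22 is synthesized the same way
+# as Sigma1: pairs of left/right shifts xor'd together in place of rotates.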
vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(44+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(44+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(44+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(44+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(44+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(44+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(44+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(44+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, 
%xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(48+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(48+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(48+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(48+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(48+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(48+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + 
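+# The vpaddd into a fresh register just above forms the new e (= d + T1, with
+# T1 = h + Sigma1(e) + Ch + K[t] + W[t]); once Maj and Sigma0 are added, a
+# final vpaddd forms the new a (= T1 + Maj + Sigma0).  The other six working
+# variables simply keep their registers, so the state rotates from round to
+# round by register renaming rather than by moves.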
vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(48+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(48+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(52+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(52+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(52+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(52+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, 
%xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(52+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(52+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(52+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(52+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(56+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(56+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(56+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(56+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + 
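+# The XOP core further below (sha256_transform_4way_core_xop) performs the
+# same computation but with vprotd, a true packed rotate: in its message
+# schedule, W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16] is
+# expanded two words per step, each sigma taking just two vprotd plus one
+# vpsrld instead of a shift/xor chain.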
vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(56+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(56+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(56+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(56+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(60+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(60+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, 
%xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(60+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(60+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(60+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(60+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(60+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(60+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, 
%xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + jmp sha256_transform_4way_finish +#endif /* USE_AVX */ + + +#if defined(USE_XOP) + .text + .p2align 6 +sha256_transform_4way_core_xop: + leaq 256(%rsp), %rax + movdqa -2*16(%rax), %xmm3 + movdqa -1*16(%rax), %xmm7 + vmovdqa (0-15)*16(%rax), %xmm0 + vmovdqa (0-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (0-16)*16(%rax), %xmm0, %xmm0 + vpaddd (0-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (0-7)*16(%rax), %xmm0, %xmm0 + vpaddd (0-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 0*16(%rax) + vmovdqa %xmm7, (0+1)*16(%rax) + vmovdqa (2-15)*16(%rax), %xmm0 + vmovdqa (2-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (2-16)*16(%rax), %xmm0, %xmm0 + vpaddd (2-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (2-7)*16(%rax), %xmm0, %xmm0 + vpaddd (2-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 2*16(%rax) + vmovdqa %xmm7, (2+1)*16(%rax) + vmovdqa (4-15)*16(%rax), %xmm0 + vmovdqa (4-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (4-16)*16(%rax), %xmm0, %xmm0 + vpaddd (4-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (4-7)*16(%rax), %xmm0, %xmm0 + vpaddd (4-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 4*16(%rax) + vmovdqa %xmm7, (4+1)*16(%rax) + vmovdqa (6-15)*16(%rax), %xmm0 + vmovdqa (6-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (6-16)*16(%rax), %xmm0, %xmm0 + vpaddd (6-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (6-7)*16(%rax), 
%xmm0, %xmm0 + vpaddd (6-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 6*16(%rax) + vmovdqa %xmm7, (6+1)*16(%rax) + vmovdqa (8-15)*16(%rax), %xmm0 + vmovdqa (8-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (8-16)*16(%rax), %xmm0, %xmm0 + vpaddd (8-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (8-7)*16(%rax), %xmm0, %xmm0 + vpaddd (8-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 8*16(%rax) + vmovdqa %xmm7, (8+1)*16(%rax) + vmovdqa (10-15)*16(%rax), %xmm0 + vmovdqa (10-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (10-16)*16(%rax), %xmm0, %xmm0 + vpaddd (10-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (10-7)*16(%rax), %xmm0, %xmm0 + vpaddd (10-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 10*16(%rax) + vmovdqa %xmm7, (10+1)*16(%rax) + vmovdqa (12-15)*16(%rax), %xmm0 + vmovdqa (12-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (12-16)*16(%rax), %xmm0, %xmm0 + vpaddd (12-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (12-7)*16(%rax), %xmm0, %xmm0 + vpaddd (12-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 12*16(%rax) + vmovdqa %xmm7, (12+1)*16(%rax) + vmovdqa (14-15)*16(%rax), %xmm0 + vmovdqa (14-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (14-16)*16(%rax), %xmm0, %xmm0 + vpaddd (14-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor 
%xmm5, %xmm6, %xmm6 + + vpaddd (14-7)*16(%rax), %xmm0, %xmm0 + vpaddd (14-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 14*16(%rax) + vmovdqa %xmm7, (14+1)*16(%rax) + vmovdqa (16-15)*16(%rax), %xmm0 + vmovdqa (16-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (16-16)*16(%rax), %xmm0, %xmm0 + vpaddd (16-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (16-7)*16(%rax), %xmm0, %xmm0 + vpaddd (16-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 16*16(%rax) + vmovdqa %xmm7, (16+1)*16(%rax) + vmovdqa (18-15)*16(%rax), %xmm0 + vmovdqa (18-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (18-16)*16(%rax), %xmm0, %xmm0 + vpaddd (18-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (18-7)*16(%rax), %xmm0, %xmm0 + vpaddd (18-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 18*16(%rax) + vmovdqa %xmm7, (18+1)*16(%rax) + vmovdqa (20-15)*16(%rax), %xmm0 + vmovdqa (20-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (20-16)*16(%rax), %xmm0, %xmm0 + vpaddd (20-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (20-7)*16(%rax), %xmm0, %xmm0 + vpaddd (20-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 20*16(%rax) + vmovdqa %xmm7, (20+1)*16(%rax) + vmovdqa (22-15)*16(%rax), %xmm0 + vmovdqa (22-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (22-16)*16(%rax), %xmm0, %xmm0 + vpaddd (22-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd 
$13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (22-7)*16(%rax), %xmm0, %xmm0 + vpaddd (22-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 22*16(%rax) + vmovdqa %xmm7, (22+1)*16(%rax) + vmovdqa (24-15)*16(%rax), %xmm0 + vmovdqa (24-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (24-16)*16(%rax), %xmm0, %xmm0 + vpaddd (24-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (24-7)*16(%rax), %xmm0, %xmm0 + vpaddd (24-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 24*16(%rax) + vmovdqa %xmm7, (24+1)*16(%rax) + vmovdqa (26-15)*16(%rax), %xmm0 + vmovdqa (26-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (26-16)*16(%rax), %xmm0, %xmm0 + vpaddd (26-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (26-7)*16(%rax), %xmm0, %xmm0 + vpaddd (26-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 26*16(%rax) + vmovdqa %xmm7, (26+1)*16(%rax) + vmovdqa (28-15)*16(%rax), %xmm0 + vmovdqa (28-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (28-16)*16(%rax), %xmm0, %xmm0 + vpaddd (28-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (28-7)*16(%rax), %xmm0, %xmm0 + vpaddd (28-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 28*16(%rax) + vmovdqa %xmm7, (28+1)*16(%rax) + vmovdqa (30-15)*16(%rax), %xmm0 + vmovdqa (30-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (30-16)*16(%rax), %xmm0, %xmm0 + vpaddd (30-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd 
$15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (30-7)*16(%rax), %xmm0, %xmm0 + vpaddd (30-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 30*16(%rax) + vmovdqa %xmm7, (30+1)*16(%rax) + vmovdqa (32-15)*16(%rax), %xmm0 + vmovdqa (32-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (32-16)*16(%rax), %xmm0, %xmm0 + vpaddd (32-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (32-7)*16(%rax), %xmm0, %xmm0 + vpaddd (32-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 32*16(%rax) + vmovdqa %xmm7, (32+1)*16(%rax) + vmovdqa (34-15)*16(%rax), %xmm0 + vmovdqa (34-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (34-16)*16(%rax), %xmm0, %xmm0 + vpaddd (34-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (34-7)*16(%rax), %xmm0, %xmm0 + vpaddd (34-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 34*16(%rax) + vmovdqa %xmm7, (34+1)*16(%rax) + vmovdqa (36-15)*16(%rax), %xmm0 + vmovdqa (36-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (36-16)*16(%rax), %xmm0, %xmm0 + vpaddd (36-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (36-7)*16(%rax), %xmm0, %xmm0 + vpaddd (36-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 36*16(%rax) + vmovdqa %xmm7, (36+1)*16(%rax) + vmovdqa (38-15)*16(%rax), %xmm0 + vmovdqa (38-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (38-16)*16(%rax), %xmm0, %xmm0 + vpaddd 
(38-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (38-7)*16(%rax), %xmm0, %xmm0 + vpaddd (38-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 38*16(%rax) + vmovdqa %xmm7, (38+1)*16(%rax) + vmovdqa (40-15)*16(%rax), %xmm0 + vmovdqa (40-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (40-16)*16(%rax), %xmm0, %xmm0 + vpaddd (40-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (40-7)*16(%rax), %xmm0, %xmm0 + vpaddd (40-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 40*16(%rax) + vmovdqa %xmm7, (40+1)*16(%rax) + vmovdqa (42-15)*16(%rax), %xmm0 + vmovdqa (42-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (42-16)*16(%rax), %xmm0, %xmm0 + vpaddd (42-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (42-7)*16(%rax), %xmm0, %xmm0 + vpaddd (42-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 42*16(%rax) + vmovdqa %xmm7, (42+1)*16(%rax) + vmovdqa (44-15)*16(%rax), %xmm0 + vmovdqa (44-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (44-16)*16(%rax), %xmm0, %xmm0 + vpaddd (44-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (44-7)*16(%rax), %xmm0, %xmm0 + vpaddd (44-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 44*16(%rax) + vmovdqa %xmm7, (44+1)*16(%rax) + vmovdqa (46-15)*16(%rax), %xmm0 + vmovdqa (46-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, 
%xmm4 + + vpaddd (46-16)*16(%rax), %xmm0, %xmm0 + vpaddd (46-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (46-7)*16(%rax), %xmm0, %xmm0 + vpaddd (46-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 46*16(%rax) + vmovdqa %xmm7, (46+1)*16(%rax) + movdqu 0(%rdi), %xmm7 + movdqu 16(%rdi), %xmm5 + movdqu 32(%rdi), %xmm4 + movdqu 48(%rdi), %xmm3 + movdqu 64(%rdi), %xmm0 + movdqu 80(%rdi), %xmm8 + movdqu 96(%rdi), %xmm9 + movdqu 112(%rdi), %xmm10 + movq %rsp, %rax + leaq sha256_4k(%rip), %rcx + vpaddd 16*(0+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(0+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(0+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(0+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(0+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(0+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(0+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(0+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd 
%xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(4+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(4+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(4+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(4+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(4+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(4+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(4+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(4+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(8+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(8+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, 
%xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(8+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(8+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(8+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(8+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(8+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(8+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(12+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(12+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(12+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(12+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + 
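+ /* 4-way SHA-256 rounds (XOP path): each round adds the next schedule word from the
+  * stack (%rax = %rsp) and the matching entry of sha256_4k (%rcx), presumably the K
+  * constants broadcast across the four lanes, then Ch(e,f,g) via vpandn/vpand/vpxor and
+  * Sigma1(e) via vprotd $26/$21/$7 (rotr 6/11/25) to form T1; Maj(a,b,c) plus Sigma0(a)
+  * via vprotd $30/$19/$10 (rotr 2/13/22) then yield the new a.  The code at the top of
+  * this core extends the message schedule, W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15])
+  * + W[i-16], with sigma0 from vprotd $25/$14 + vpsrld $3 and sigma1 from vprotd $15/$13
+  * + vpsrld $10. */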
vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(12+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(12+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(12+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(12+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(16+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(16+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(16+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(16+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(16+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(16+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + 
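+ /* Ch(e,f,g) = (e & f) ^ (~e & g): vpandn supplies the ~e & g term directly, so no
+  * separate NOT is needed.  Maj(a,b,c) is the three pairwise ANDs folded with two XORs. */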
vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(16+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(16+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(20+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(20+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(20+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(20+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(20+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(20+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(20+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(20+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand 
%xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(24+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(24+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(24+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(24+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(24+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(24+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(24+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(24+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 
16*(28+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(28+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(28+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(28+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(28+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(28+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(28+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(28+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(32+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(32+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor 
%xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(32+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(32+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(32+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(32+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(32+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(32+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(36+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(36+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(36+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(36+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor 
%xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(36+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(36+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(36+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(36+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(40+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(40+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(40+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(40+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(40+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(40+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, 
%xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(40+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(40+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(44+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(44+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(44+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(44+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(44+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(44+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(44+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(44+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, 
%xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(48+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(48+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(48+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(48+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(48+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(48+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(48+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(48+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(52+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(52+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 
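+ /* The eight working variables stay pinned in %xmm0/3/4/5/7/8/9/10; their roles rotate
+  * with a four-round period, so the fully unrolled rounds repeat the same instruction
+  * pattern and only the W/K displacements change from block to block. */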
+ + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(52+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(52+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(52+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(52+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(52+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(52+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(56+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(56+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(56+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(56+1)(%rcx), %xmm6, %xmm6 + + 
vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(56+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(56+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(56+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(56+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(60+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(60+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(60+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(60+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + 
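+ /* After round 63 the core jumps to sha256_transform_4way_finish.  The
+  * sha256_transform_4way entry point below saves %xmm6-%xmm11 and %rdi/%rsi on
+  * Win64/Cygwin, carves out a 128-byte-aligned scratch frame for the 64-entry
+  * schedule, copies the 16 input vectors onto the stack (byte-swapping them with
+  * pshuflw/pshufhw + psllw/psrlw/pxor when the third argument is non-zero), and
+  * tail-jumps through sha256_transform_4way_core_addr, which is presumably set at
+  * startup to the SSE2/AVX/XOP core matching the host CPU.  The finish label adds
+  * the saved state back into the working variables (the Davies-Meyer feed-forward)
+  * and stores the updated 4-way state through %rdi. */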
vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(60+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(60+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(60+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(60+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + jmp sha256_transform_4way_finish +#endif /* USE_XOP */ + + + .data + .p2align 3 +sha256_transform_4way_core_addr: + .quad 0x0 + + + .text + .p2align 6 + .globl sha256_transform_4way + .globl _sha256_transform_4way +sha256_transform_4way: +_sha256_transform_4way: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + subq $96, %rsp + movdqa %xmm6, 0(%rsp) + movdqa %xmm7, 16(%rsp) + movdqa %xmm8, 32(%rsp) + movdqa %xmm9, 48(%rsp) + movdqa %xmm10, 64(%rsp) + movdqa %xmm11, 80(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + movq %rsp, %r8 + subq $1032, %rsp + andq $-128, %rsp + + testq %rdx, %rdx + jnz sha256_transform_4way_swap + + movdqu 0*16(%rsi), %xmm0 + movdqu 1*16(%rsi), %xmm1 + movdqu 2*16(%rsi), %xmm2 + movdqu 3*16(%rsi), %xmm3 + movdqu 4*16(%rsi), %xmm4 + movdqu 5*16(%rsi), %xmm5 + movdqu 6*16(%rsi), %xmm6 + movdqu 7*16(%rsi), %xmm7 + movdqa %xmm0, 0*16(%rsp) + movdqa %xmm1, 1*16(%rsp) + movdqa %xmm2, 2*16(%rsp) + movdqa %xmm3, 3*16(%rsp) + movdqa %xmm4, 4*16(%rsp) + movdqa %xmm5, 5*16(%rsp) + movdqa %xmm6, 6*16(%rsp) + movdqa %xmm7, 7*16(%rsp) + movdqu 8*16(%rsi), %xmm0 + movdqu 9*16(%rsi), %xmm1 + movdqu 10*16(%rsi), %xmm2 + movdqu 11*16(%rsi), %xmm3 + movdqu 12*16(%rsi), %xmm4 + movdqu 13*16(%rsi), %xmm5 + movdqu 14*16(%rsi), %xmm6 + movdqu 15*16(%rsi), %xmm7 + movdqa %xmm0, 8*16(%rsp) + movdqa %xmm1, 9*16(%rsp) + movdqa %xmm2, 10*16(%rsp) + movdqa %xmm3, 11*16(%rsp) + movdqa %xmm4, 12*16(%rsp) + movdqa %xmm5, 13*16(%rsp) + movdqa %xmm6, 14*16(%rsp) + movdqa %xmm7, 15*16(%rsp) + jmp *sha256_transform_4way_core_addr(%rip) + + .p2align 6 +sha256_transform_4way_swap: + movdqu 0*16(%rsi), %xmm0 + movdqu (0+1)*16(%rsi), %xmm2 + pshuflw $0xb1, %xmm0, %xmm0 + pshuflw $0xb1, %xmm2, %xmm2 + pshufhw $0xb1, %xmm0, %xmm0 + pshufhw $0xb1, %xmm2, %xmm2 + movdqa %xmm0, %xmm1 + movdqa %xmm2, %xmm3 + psrlw $8, %xmm1 + psrlw $8, %xmm3 + psllw $8, %xmm0 + psllw $8, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm3, %xmm2 + movdqa %xmm0, 0*16(%rsp) + movdqa %xmm2, (0+1)*16(%rsp) + movdqu 2*16(%rsi), %xmm0 + movdqu 
(2+1)*16(%rsi), %xmm2 + pshuflw $0xb1, %xmm0, %xmm0 + pshuflw $0xb1, %xmm2, %xmm2 + pshufhw $0xb1, %xmm0, %xmm0 + pshufhw $0xb1, %xmm2, %xmm2 + movdqa %xmm0, %xmm1 + movdqa %xmm2, %xmm3 + psrlw $8, %xmm1 + psrlw $8, %xmm3 + psllw $8, %xmm0 + psllw $8, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm3, %xmm2 + movdqa %xmm0, 2*16(%rsp) + movdqa %xmm2, (2+1)*16(%rsp) + movdqu 4*16(%rsi), %xmm0 + movdqu (4+1)*16(%rsi), %xmm2 + pshuflw $0xb1, %xmm0, %xmm0 + pshuflw $0xb1, %xmm2, %xmm2 + pshufhw $0xb1, %xmm0, %xmm0 + pshufhw $0xb1, %xmm2, %xmm2 + movdqa %xmm0, %xmm1 + movdqa %xmm2, %xmm3 + psrlw $8, %xmm1 + psrlw $8, %xmm3 + psllw $8, %xmm0 + psllw $8, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm3, %xmm2 + movdqa %xmm0, 4*16(%rsp) + movdqa %xmm2, (4+1)*16(%rsp) + movdqu 6*16(%rsi), %xmm0 + movdqu (6+1)*16(%rsi), %xmm2 + pshuflw $0xb1, %xmm0, %xmm0 + pshuflw $0xb1, %xmm2, %xmm2 + pshufhw $0xb1, %xmm0, %xmm0 + pshufhw $0xb1, %xmm2, %xmm2 + movdqa %xmm0, %xmm1 + movdqa %xmm2, %xmm3 + psrlw $8, %xmm1 + psrlw $8, %xmm3 + psllw $8, %xmm0 + psllw $8, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm3, %xmm2 + movdqa %xmm0, 6*16(%rsp) + movdqa %xmm2, (6+1)*16(%rsp) + movdqu 8*16(%rsi), %xmm0 + movdqu (8+1)*16(%rsi), %xmm2 + pshuflw $0xb1, %xmm0, %xmm0 + pshuflw $0xb1, %xmm2, %xmm2 + pshufhw $0xb1, %xmm0, %xmm0 + pshufhw $0xb1, %xmm2, %xmm2 + movdqa %xmm0, %xmm1 + movdqa %xmm2, %xmm3 + psrlw $8, %xmm1 + psrlw $8, %xmm3 + psllw $8, %xmm0 + psllw $8, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm3, %xmm2 + movdqa %xmm0, 8*16(%rsp) + movdqa %xmm2, (8+1)*16(%rsp) + movdqu 10*16(%rsi), %xmm0 + movdqu (10+1)*16(%rsi), %xmm2 + pshuflw $0xb1, %xmm0, %xmm0 + pshuflw $0xb1, %xmm2, %xmm2 + pshufhw $0xb1, %xmm0, %xmm0 + pshufhw $0xb1, %xmm2, %xmm2 + movdqa %xmm0, %xmm1 + movdqa %xmm2, %xmm3 + psrlw $8, %xmm1 + psrlw $8, %xmm3 + psllw $8, %xmm0 + psllw $8, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm3, %xmm2 + movdqa %xmm0, 10*16(%rsp) + movdqa %xmm2, (10+1)*16(%rsp) + movdqu 12*16(%rsi), %xmm0 + movdqu (12+1)*16(%rsi), %xmm2 + pshuflw $0xb1, %xmm0, %xmm0 + pshuflw $0xb1, %xmm2, %xmm2 + pshufhw $0xb1, %xmm0, %xmm0 + pshufhw $0xb1, %xmm2, %xmm2 + movdqa %xmm0, %xmm1 + movdqa %xmm2, %xmm3 + psrlw $8, %xmm1 + psrlw $8, %xmm3 + psllw $8, %xmm0 + psllw $8, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm3, %xmm2 + movdqa %xmm0, 12*16(%rsp) + movdqa %xmm2, (12+1)*16(%rsp) + movdqu 14*16(%rsi), %xmm0 + movdqu (14+1)*16(%rsi), %xmm2 + pshuflw $0xb1, %xmm0, %xmm0 + pshuflw $0xb1, %xmm2, %xmm2 + pshufhw $0xb1, %xmm0, %xmm0 + pshufhw $0xb1, %xmm2, %xmm2 + movdqa %xmm0, %xmm1 + movdqa %xmm2, %xmm3 + psrlw $8, %xmm1 + psrlw $8, %xmm3 + psllw $8, %xmm0 + psllw $8, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm3, %xmm2 + movdqa %xmm0, 14*16(%rsp) + movdqa %xmm2, (14+1)*16(%rsp) + jmp *sha256_transform_4way_core_addr(%rip) + + .p2align 6 +sha256_transform_4way_finish: + movdqu 0(%rdi), %xmm2 + movdqu 16(%rdi), %xmm6 + movdqu 32(%rdi), %xmm11 + movdqu 48(%rdi), %xmm1 + paddd %xmm2, %xmm7 + paddd %xmm6, %xmm5 + paddd %xmm11, %xmm4 + paddd %xmm1, %xmm3 + movdqu 64(%rdi), %xmm2 + movdqu 80(%rdi), %xmm6 + movdqu 96(%rdi), %xmm11 + movdqu 112(%rdi), %xmm1 + paddd %xmm2, %xmm0 + paddd %xmm6, %xmm8 + paddd %xmm11, %xmm9 + paddd %xmm1, %xmm10 + + movdqu %xmm7, 0(%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm4, 32(%rdi) + movdqu %xmm3, 48(%rdi) + movdqu %xmm0, 64(%rdi) + movdqu %xmm8, 80(%rdi) + movdqu %xmm9, 96(%rdi) + movdqu %xmm10, 112(%rdi) + + movq %r8, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + movdqa 0(%rsp), %xmm6 + movdqa 16(%rsp), %xmm7 + movdqa 32(%rsp), %xmm8 + movdqa 48(%rsp), 
%xmm9 + movdqa 64(%rsp), %xmm10 + movdqa 80(%rsp), %xmm11 + addq $96, %rsp + popq %rdi +#endif + ret + + +#ifdef USE_AVX2 + + .text + .p2align 6 +sha256_transform_8way_core_avx2: + leaq 8*64(%rsp), %rax + vmovdqa -2*32(%rax), %ymm3 + vmovdqa -1*32(%rax), %ymm7 + vmovdqa (0-15)*32(%rax), %ymm0 + vmovdqa (0-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (0-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (0-7)*32(%rax), %ymm0, %ymm0 + vpaddd (0-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 0*32(%rax) + vmovdqa %ymm7, (0+1)*32(%rax) + vmovdqa (2-15)*32(%rax), %ymm0 + vmovdqa (2-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (2-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (2-7)*32(%rax), %ymm0, %ymm0 + vpaddd (2-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 2*32(%rax) + vmovdqa %ymm7, (2+1)*32(%rax) + vmovdqa (4-15)*32(%rax), %ymm0 + vmovdqa (4-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (4-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + 
vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (4-7)*32(%rax), %ymm0, %ymm0 + vpaddd (4-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 4*32(%rax) + vmovdqa %ymm7, (4+1)*32(%rax) + vmovdqa (6-15)*32(%rax), %ymm0 + vmovdqa (6-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (6-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (6-7)*32(%rax), %ymm0, %ymm0 + vpaddd (6-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 6*32(%rax) + vmovdqa %ymm7, (6+1)*32(%rax) + vmovdqa (8-15)*32(%rax), %ymm0 + vmovdqa (8-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (8-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (8-7)*32(%rax), %ymm0, %ymm0 + vpaddd (8-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 8*32(%rax) + vmovdqa %ymm7, (8+1)*32(%rax) + vmovdqa (10-15)*32(%rax), %ymm0 + vmovdqa (10-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor 
%ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (10-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (10-7)*32(%rax), %ymm0, %ymm0 + vpaddd (10-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 10*32(%rax) + vmovdqa %ymm7, (10+1)*32(%rax) + vmovdqa (12-15)*32(%rax), %ymm0 + vmovdqa (12-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (12-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (12-7)*32(%rax), %ymm0, %ymm0 + vpaddd (12-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 12*32(%rax) + vmovdqa %ymm7, (12+1)*32(%rax) + vmovdqa (14-15)*32(%rax), %ymm0 + vmovdqa (14-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (14-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (14-7)*32(%rax), %ymm0, %ymm0 + vpaddd (14-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, 
%ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 14*32(%rax) + vmovdqa %ymm7, (14+1)*32(%rax) + vmovdqa (16-15)*32(%rax), %ymm0 + vmovdqa (16-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (16-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (16-7)*32(%rax), %ymm0, %ymm0 + vpaddd (16-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 16*32(%rax) + vmovdqa %ymm7, (16+1)*32(%rax) + vmovdqa (18-15)*32(%rax), %ymm0 + vmovdqa (18-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (18-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (18-7)*32(%rax), %ymm0, %ymm0 + vpaddd (18-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 18*32(%rax) + vmovdqa %ymm7, (18+1)*32(%rax) + vmovdqa (20-15)*32(%rax), %ymm0 + vmovdqa (20-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (20-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd 
(20-7)*32(%rax), %ymm0, %ymm0 + vpaddd (20-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 20*32(%rax) + vmovdqa %ymm7, (20+1)*32(%rax) + vmovdqa (22-15)*32(%rax), %ymm0 + vmovdqa (22-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (22-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (22-7)*32(%rax), %ymm0, %ymm0 + vpaddd (22-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 22*32(%rax) + vmovdqa %ymm7, (22+1)*32(%rax) + vmovdqa (24-15)*32(%rax), %ymm0 + vmovdqa (24-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (24-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (24-7)*32(%rax), %ymm0, %ymm0 + vpaddd (24-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 24*32(%rax) + vmovdqa %ymm7, (24+1)*32(%rax) + vmovdqa (26-15)*32(%rax), %ymm0 + vmovdqa (26-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 
+ vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (26-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (26-7)*32(%rax), %ymm0, %ymm0 + vpaddd (26-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 26*32(%rax) + vmovdqa %ymm7, (26+1)*32(%rax) + vmovdqa (28-15)*32(%rax), %ymm0 + vmovdqa (28-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (28-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (28-7)*32(%rax), %ymm0, %ymm0 + vpaddd (28-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 28*32(%rax) + vmovdqa %ymm7, (28+1)*32(%rax) + vmovdqa (30-15)*32(%rax), %ymm0 + vmovdqa (30-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (30-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (30-7)*32(%rax), %ymm0, %ymm0 + vpaddd (30-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd 
%ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 30*32(%rax) + vmovdqa %ymm7, (30+1)*32(%rax) + vmovdqa (32-15)*32(%rax), %ymm0 + vmovdqa (32-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (32-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (32-7)*32(%rax), %ymm0, %ymm0 + vpaddd (32-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 32*32(%rax) + vmovdqa %ymm7, (32+1)*32(%rax) + vmovdqa (34-15)*32(%rax), %ymm0 + vmovdqa (34-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (34-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (34-7)*32(%rax), %ymm0, %ymm0 + vpaddd (34-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 34*32(%rax) + vmovdqa %ymm7, (34+1)*32(%rax) + vmovdqa (36-15)*32(%rax), %ymm0 + vmovdqa (36-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (36-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (36-7)*32(%rax), %ymm0, %ymm0 + vpaddd (36-6)*32(%rax), %ymm4, %ymm4 + + 
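+	/* rounds 16-63 message schedule (AVX2, 8 lanes): W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16], two schedule words per unrolled step */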
vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 36*32(%rax) + vmovdqa %ymm7, (36+1)*32(%rax) + vmovdqa (38-15)*32(%rax), %ymm0 + vmovdqa (38-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (38-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (38-7)*32(%rax), %ymm0, %ymm0 + vpaddd (38-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 38*32(%rax) + vmovdqa %ymm7, (38+1)*32(%rax) + vmovdqa (40-15)*32(%rax), %ymm0 + vmovdqa (40-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (40-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (40-7)*32(%rax), %ymm0, %ymm0 + vpaddd (40-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 40*32(%rax) + vmovdqa %ymm7, (40+1)*32(%rax) + vmovdqa (42-15)*32(%rax), %ymm0 + vmovdqa (42-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, 
%ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (42-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (42-7)*32(%rax), %ymm0, %ymm0 + vpaddd (42-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 42*32(%rax) + vmovdqa %ymm7, (42+1)*32(%rax) + vmovdqa (44-15)*32(%rax), %ymm0 + vmovdqa (44-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (44-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (44-7)*32(%rax), %ymm0, %ymm0 + vpaddd (44-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 44*32(%rax) + vmovdqa %ymm7, (44+1)*32(%rax) + vmovdqa (46-15)*32(%rax), %ymm0 + vmovdqa (46-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (46-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (46-7)*32(%rax), %ymm0, %ymm0 + vpaddd (46-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 46*32(%rax) + vmovdqa %ymm7, 
(46+1)*32(%rax) + vmovdqu 0*32(%rdi), %ymm7 + vmovdqu 1*32(%rdi), %ymm5 + vmovdqu 2*32(%rdi), %ymm4 + vmovdqu 3*32(%rdi), %ymm3 + vmovdqu 4*32(%rdi), %ymm0 + vmovdqu 5*32(%rdi), %ymm8 + vmovdqu 6*32(%rdi), %ymm9 + vmovdqu 7*32(%rdi), %ymm10 + movq %rsp, %rax + leaq sha256_8k(%rip), %rcx + vpaddd 32*(0+0)(%rax), %ymm10, %ymm6 + vpaddd 32*(0+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(0+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(0+1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(0+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(0+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(0+3)(%rax), %ymm0, %ymm6 + vpaddd 32*(0+3)(%rcx), %ymm6, %ymm6 + + vpandn 
%ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(4+0)(%rax), %ymm10, %ymm6 + vpaddd 32*(4+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(4+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(4+1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(4+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(4+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + 
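+	/* unrolled compression rounds: t1 = h + S1(e) + Ch(e,f,g) + K[i] + W[i], t2 = S0(a) + Maj(a,b,c); the working state rotates through registers instead of being shuffled */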
vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(4+3)(%rax), %ymm0, %ymm6 + vpaddd 32*(4+3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(8+0)(%rax), %ymm10, %ymm6 + vpaddd 32*(8+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(8+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(8+1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, 
%ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(8+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(8+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(8+3)(%rax), %ymm0, %ymm6 + vpaddd 32*(8+3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(12+0)(%rax), %ymm10, %ymm6 + vpaddd 32*(12+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(12+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(12+1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd 
%ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(12+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(12+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(12+3)(%rax), %ymm0, %ymm6 + vpaddd 32*(12+3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(16+0)(%rax), %ymm10, %ymm6 + vpaddd 32*(16+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, 
%ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(16+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(16+1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(16+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(16+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(16+3)(%rax), %ymm0, %ymm6 + vpaddd 32*(16+3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, 
%ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(20+0)(%rax), %ymm10, %ymm6 + vpaddd 32*(20+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(20+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(20+1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(20+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(20+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(20+3)(%rax), %ymm0, %ymm6 + vpaddd 32*(20+3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld 
$5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(24+0)(%rax), %ymm10, %ymm6 + vpaddd 32*(24+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(24+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(24+1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(24+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(24+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, 
%ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(24+3)(%rax), %ymm0, %ymm6 + vpaddd 32*(24+3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(28+0)(%rax), %ymm10, %ymm6 + vpaddd 32*(28+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(28+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(28+1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor 
%ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(28+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(28+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(28+3)(%rax), %ymm0, %ymm6 + vpaddd 32*(28+3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(32+0)(%rax), %ymm10, %ymm6 + vpaddd 32*(32+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(32+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(32+1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, 
%ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(32+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(32+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(32+3)(%rax), %ymm0, %ymm6 + vpaddd 32*(32+3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(36+0)(%rax), %ymm10, %ymm6 + vpaddd 32*(36+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd 
%ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(36+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(36+1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(36+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(36+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(36+3)(%rax), %ymm0, %ymm6 + vpaddd 32*(36+3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(40+0)(%rax), %ymm10, %ymm6 
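+	/*
+	 * Each repeated block in this unrolled sequence is one SHA-256
+	 * round applied to eight independent 32-bit lanes: the two
+	 * leading vpaddd fold in the per-lane schedule word (loaded via
+	 * %rax) and what appears to be the matching round constant
+	 * (loaded via %rcx); vpandn/vpand/vpxor compute Ch(e,f,g); the
+	 * vpslld/vpsrld/vpxor chains compose the Sigma1 (rotr 6,11,25)
+	 * and Sigma0 (rotr 2,13,22) rotations; and the three vpand plus
+	 * two vpxor compute Maj(a,b,c).
+	 */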
+ vpaddd 32*(40+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(40+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(40+1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(40+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(40+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(40+3)(%rax), %ymm0, %ymm6 + vpaddd 32*(40+3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + 
vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(44+0)(%rax), %ymm10, %ymm6 + vpaddd 32*(44+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(44+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(44+1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(44+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(44+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld 
$11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(44+3)(%rax), %ymm0, %ymm6 + vpaddd 32*(44+3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(48+0)(%rax), %ymm10, %ymm6 + vpaddd 32*(48+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(48+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(48+1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(48+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(48+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 
+ vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(48+3)(%rax), %ymm0, %ymm6 + vpaddd 32*(48+3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(52+0)(%rax), %ymm10, %ymm6 + vpaddd 32*(52+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(52+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(52+1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, 
%ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(52+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(52+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(52+3)(%rax), %ymm0, %ymm6 + vpaddd 32*(52+3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(56+0)(%rax), %ymm10, %ymm6 + vpaddd 32*(56+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, 
%ymm3 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(56+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(56+1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(56+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(56+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(56+3)(%rax), %ymm0, %ymm6 + vpaddd 32*(56+3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(60+0)(%rax), %ymm10, %ymm6 + vpaddd 32*(60+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, 
%ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(60+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(60+1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(60+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(60+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(60+3)(%rax), %ymm0, %ymm6 + vpaddd 32*(60+3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand 
%ymm4, %ymm3, %ymm2
+	vpand %ymm5, %ymm3, %ymm7
+	vpand %ymm5, %ymm4, %ymm1
+	vpxor %ymm7, %ymm1, %ymm1
+	vpxor %ymm2, %ymm1, %ymm1
+	vpaddd %ymm1, %ymm6, %ymm6
+
+	vpslld $10, %ymm5, %ymm2
+	vpsrld $2, %ymm5, %ymm7
+	vpsrld $11, %ymm7, %ymm1
+	vpxor %ymm2, %ymm7, %ymm7
+	vpxor %ymm1, %ymm7, %ymm7
+	vpslld $9, %ymm2, %ymm2
+	vpsrld $9, %ymm1, %ymm1
+	vpxor %ymm2, %ymm7, %ymm7
+	vpxor %ymm1, %ymm7, %ymm7
+	vpslld $11, %ymm2, %ymm2
+	vpxor %ymm2, %ymm7, %ymm7
+	vpaddd %ymm6, %ymm7, %ymm7
+	jmp sha256_transform_8way_finish
+
+
+	.text
+	.p2align 6
+	.globl sha256_transform_8way
+	.globl _sha256_transform_8way
+sha256_transform_8way:
+_sha256_transform_8way:
+#if defined(_WIN64) || defined(__CYGWIN__)
+	pushq %rdi
+	subq $96, %rsp
+	vmovdqa %xmm6, 0(%rsp)
+	vmovdqa %xmm7, 16(%rsp)
+	vmovdqa %xmm8, 32(%rsp)
+	vmovdqa %xmm9, 48(%rsp)
+	vmovdqa %xmm10, 64(%rsp)
+	vmovdqa %xmm11, 80(%rsp)
+	pushq %rsi
+	movq %rcx, %rdi
+	movq %rdx, %rsi
+	movq %r8, %rdx
+#endif
+	movq %rsp, %r8
+	subq $64*32, %rsp
+	andq $-128, %rsp
+
+	testq %rdx, %rdx
+	jnz sha256_transform_8way_swap
+
+	vmovdqu 0*32(%rsi), %ymm0
+	vmovdqu 1*32(%rsi), %ymm1
+	vmovdqu 2*32(%rsi), %ymm2
+	vmovdqu 3*32(%rsi), %ymm3
+	vmovdqu 4*32(%rsi), %ymm4
+	vmovdqu 5*32(%rsi), %ymm5
+	vmovdqu 6*32(%rsi), %ymm6
+	vmovdqu 7*32(%rsi), %ymm7
+	vmovdqa %ymm0, 0*32(%rsp)
+	vmovdqa %ymm1, 1*32(%rsp)
+	vmovdqa %ymm2, 2*32(%rsp)
+	vmovdqa %ymm3, 3*32(%rsp)
+	vmovdqa %ymm4, 4*32(%rsp)
+	vmovdqa %ymm5, 5*32(%rsp)
+	vmovdqa %ymm6, 6*32(%rsp)
+	vmovdqa %ymm7, 7*32(%rsp)
+	vmovdqu 8*32(%rsi), %ymm0
+	vmovdqu 9*32(%rsi), %ymm1
+	vmovdqu 10*32(%rsi), %ymm2
+	vmovdqu 11*32(%rsi), %ymm3
+	vmovdqu 12*32(%rsi), %ymm4
+	vmovdqu 13*32(%rsi), %ymm5
+	vmovdqu 14*32(%rsi), %ymm6
+	vmovdqu 15*32(%rsi), %ymm7
+	vmovdqa %ymm0, 8*32(%rsp)
+	vmovdqa %ymm1, 9*32(%rsp)
+	vmovdqa %ymm2, 10*32(%rsp)
+	vmovdqa %ymm3, 11*32(%rsp)
+	vmovdqa %ymm4, 12*32(%rsp)
+	vmovdqa %ymm5, 13*32(%rsp)
+	vmovdqa %ymm6, 14*32(%rsp)
+	vmovdqa %ymm7, 15*32(%rsp)
+	jmp sha256_transform_8way_core_avx2
+
+	.p2align 6
+sha256_transform_8way_swap:
+	vmovdqu 0*32(%rsi), %ymm0
+	vmovdqu (0+1)*32(%rsi), %ymm2
+	vpshuflw $0xb1, %ymm0, %ymm0
+	vpshuflw $0xb1, %ymm2, %ymm2
+	vpshufhw $0xb1, %ymm0, %ymm0
+	vpshufhw $0xb1, %ymm2, %ymm2
+	vpsrlw $8, %ymm0, %ymm1
+	vpsrlw $8, %ymm2, %ymm3
+	vpsllw $8, %ymm0, %ymm0
+	vpsllw $8, %ymm2, %ymm2
+	vpxor %ymm1, %ymm0, %ymm0
+	vpxor %ymm3, %ymm2, %ymm2
+	vmovdqa %ymm0, 0*32(%rsp)
+	vmovdqa %ymm2, (0+1)*32(%rsp)
+	vmovdqu 2*32(%rsi), %ymm0
+	vmovdqu (2+1)*32(%rsi), %ymm2
+	vpshuflw $0xb1, %ymm0, %ymm0
+	vpshuflw $0xb1, %ymm2, %ymm2
+	vpshufhw $0xb1, %ymm0, %ymm0
+	vpshufhw $0xb1, %ymm2, %ymm2
+	vpsrlw $8, %ymm0, %ymm1
+	vpsrlw $8, %ymm2, %ymm3
+	vpsllw $8, %ymm0, %ymm0
+	vpsllw $8, %ymm2, %ymm2
+	vpxor %ymm1, %ymm0, %ymm0
+	vpxor %ymm3, %ymm2, %ymm2
+	vmovdqa %ymm0, 2*32(%rsp)
+	vmovdqa %ymm2, (2+1)*32(%rsp)
+	vmovdqu 4*32(%rsi), %ymm0
+	vmovdqu (4+1)*32(%rsi), %ymm2
+	vpshuflw $0xb1, %ymm0, %ymm0
+	vpshuflw $0xb1, %ymm2, %ymm2
+	vpshufhw $0xb1, %ymm0, %ymm0
+	vpshufhw $0xb1, %ymm2, %ymm2
+	vpsrlw $8, %ymm0, %ymm1
+	vpsrlw $8, %ymm2, %ymm3
+	vpsllw $8, %ymm0, %ymm0
+	vpsllw $8, %ymm2, %ymm2
+	vpxor %ymm1, %ymm0, %ymm0
+	vpxor %ymm3, %ymm2, %ymm2
+	vmovdqa %ymm0, 4*32(%rsp)
+	vmovdqa %ymm2, (4+1)*32(%rsp)
+	vmovdqu 6*32(%rsi), %ymm0
+	vmovdqu (6+1)*32(%rsi), %ymm2
+	vpshuflw $0xb1, %ymm0, %ymm0
+	vpshuflw $0xb1, %ymm2, %ymm2
+	vpshufhw $0xb1, %ymm0, %ymm0
+	vpshufhw $0xb1, %ymm2, %ymm2
+	vpsrlw $8, %ymm0, %ymm1
+	vpsrlw $8, %ymm2, %ymm3
+	vpsllw $8, %ymm0, %ymm0
+	vpsllw $8, %ymm2, %ymm2
+	vpxor %ymm1, %ymm0, %ymm0
+	vpxor %ymm3, %ymm2, %ymm2
+	vmovdqa %ymm0, 6*32(%rsp)
+	vmovdqa %ymm2, (6+1)*32(%rsp)
+	vmovdqu 8*32(%rsi), %ymm0
+	vmovdqu (8+1)*32(%rsi), %ymm2
+	vpshuflw $0xb1, %ymm0, %ymm0
+	vpshuflw $0xb1, %ymm2, %ymm2
+	vpshufhw $0xb1, %ymm0, %ymm0
+	vpshufhw $0xb1, %ymm2, %ymm2
+	vpsrlw $8, %ymm0, %ymm1
+	vpsrlw $8, %ymm2, %ymm3
+	vpsllw $8, %ymm0, %ymm0
+	vpsllw $8, %ymm2, %ymm2
+	vpxor %ymm1, %ymm0, %ymm0
+	vpxor %ymm3, %ymm2, %ymm2
+	vmovdqa %ymm0, 8*32(%rsp)
+	vmovdqa %ymm2, (8+1)*32(%rsp)
+	vmovdqu 10*32(%rsi), %ymm0
+	vmovdqu (10+1)*32(%rsi), %ymm2
+	vpshuflw $0xb1, %ymm0, %ymm0
+	vpshuflw $0xb1, %ymm2, %ymm2
+	vpshufhw $0xb1, %ymm0, %ymm0
+	vpshufhw $0xb1, %ymm2, %ymm2
+	vpsrlw $8, %ymm0, %ymm1
+	vpsrlw $8, %ymm2, %ymm3
+	vpsllw $8, %ymm0, %ymm0
+	vpsllw $8, %ymm2, %ymm2
+	vpxor %ymm1, %ymm0, %ymm0
+	vpxor %ymm3, %ymm2, %ymm2
+	vmovdqa %ymm0, 10*32(%rsp)
+	vmovdqa %ymm2, (10+1)*32(%rsp)
+	vmovdqu 12*32(%rsi), %ymm0
+	vmovdqu (12+1)*32(%rsi), %ymm2
+	vpshuflw $0xb1, %ymm0, %ymm0
+	vpshuflw $0xb1, %ymm2, %ymm2
+	vpshufhw $0xb1, %ymm0, %ymm0
+	vpshufhw $0xb1, %ymm2, %ymm2
+	vpsrlw $8, %ymm0, %ymm1
+	vpsrlw $8, %ymm2, %ymm3
+	vpsllw $8, %ymm0, %ymm0
+	vpsllw $8, %ymm2, %ymm2
+	vpxor %ymm1, %ymm0, %ymm0
+	vpxor %ymm3, %ymm2, %ymm2
+	vmovdqa %ymm0, 12*32(%rsp)
+	vmovdqa %ymm2, (12+1)*32(%rsp)
+	vmovdqu 14*32(%rsi), %ymm0
+	vmovdqu (14+1)*32(%rsi), %ymm2
+	vpshuflw $0xb1, %ymm0, %ymm0
+	vpshuflw $0xb1, %ymm2, %ymm2
+	vpshufhw $0xb1, %ymm0, %ymm0
+	vpshufhw $0xb1, %ymm2, %ymm2
+	vpsrlw $8, %ymm0, %ymm1
+	vpsrlw $8, %ymm2, %ymm3
+	vpsllw $8, %ymm0, %ymm0
+	vpsllw $8, %ymm2, %ymm2
+	vpxor %ymm1, %ymm0, %ymm0
+	vpxor %ymm3, %ymm2, %ymm2
+	vmovdqa %ymm0, 14*32(%rsp)
+	vmovdqa %ymm2, (14+1)*32(%rsp)
+	jmp sha256_transform_8way_core_avx2
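For readers following the 8-way AVX2 path: sha256_transform_8way keeps eight independent SHA-256 message blocks in the 32-bit lanes of each ymm register, the "swap" entry path byte-swaps each input word before storing it to the schedule area (the vpshuflw/vpshufhw $0xb1 plus 8-bit shift/xor sequence amounts to a per-word byte swap), and the long unrolled blocks above are standard SHA-256 compression rounds. A minimal scalar sketch of the primitives being vectorized, assuming nothing beyond the SHA-256 specification (names such as rotr32, sha256_round and bswap32 are illustrative and not part of this patch):

/* Scalar reference for the operations the vector code performs lane-wise. */
#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, unsigned n)
{
	return (x >> n) | (x << (32 - n));
}

static inline uint32_t Ch(uint32_t e, uint32_t f, uint32_t g)
{
	return (e & f) ^ (~e & g);		/* vpand/vpandn/vpxor pattern */
}

static inline uint32_t Maj(uint32_t a, uint32_t b, uint32_t c)
{
	return (a & b) ^ (a & c) ^ (b & c);	/* three vpand + two vpxor */
}

static inline uint32_t Sigma0(uint32_t a)	/* rounds: rotr 2, 13, 22 */
{
	return rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
}

static inline uint32_t Sigma1(uint32_t e)	/* rounds: rotr 6, 11, 25 */
{
	return rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
}

static inline uint32_t sigma0(uint32_t x)	/* schedule: rotr 7, 18, shr 3 */
{
	return rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3);
}

static inline uint32_t sigma1(uint32_t x)	/* schedule: rotr 17, 19, shr 10 */
{
	return rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10);
}

/* One compression round; s[0..7] are the working variables a..h,
 * k is the round constant and w the schedule word for this round. */
static inline void sha256_round(uint32_t s[8], uint32_t k, uint32_t w)
{
	uint32_t t1 = s[7] + Sigma1(s[4]) + Ch(s[4], s[5], s[6]) + k + w;
	uint32_t t2 = Sigma0(s[0]) + Maj(s[0], s[1], s[2]);
	s[7] = s[6]; s[6] = s[5]; s[5] = s[4]; s[4] = s[3] + t1;
	s[3] = s[2]; s[2] = s[1]; s[1] = s[0]; s[0] = t1 + t2;
}

/* The "swap" load path above is equivalent to this per-word byte swap. */
static inline uint32_t bswap32(uint32_t x)
{
	return (x >> 24) | ((x >> 8) & 0x0000ff00) |
	       ((x << 8) & 0x00ff0000) | (x << 24);
}

In the assembly, each of these 32-bit operations is applied to all eight lanes at once with vpaddd/vpand/vpandn/vpxor, and the rotations are composed from vpslld/vpsrld pairs because AVX2 has no packed 32-bit rotate.
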
+
+	.p2align 6
+sha256_transform_8way_finish:
+	vmovdqu 0*32(%rdi), %ymm2
+	vmovdqu 1*32(%rdi), %ymm6
+	vmovdqu 2*32(%rdi), %ymm11
+	vmovdqu 3*32(%rdi), %ymm1
+	vpaddd %ymm2, %ymm7, %ymm7
+	vpaddd %ymm6, %ymm5, %ymm5
+	vpaddd %ymm11, %ymm4, %ymm4
+	vpaddd %ymm1, %ymm3, %ymm3
+	vmovdqu 4*32(%rdi), %ymm2
+	vmovdqu 5*32(%rdi), %ymm6
+	vmovdqu 6*32(%rdi), %ymm11
+	vmovdqu 7*32(%rdi), %ymm1
+	vpaddd %ymm2, %ymm0, %ymm0
+	vpaddd %ymm6, %ymm8, %ymm8
+	vpaddd %ymm11, %ymm9, %ymm9
+	vpaddd %ymm1, %ymm10, %ymm10
+
+	vmovdqu %ymm7, 0*32(%rdi)
+	vmovdqu %ymm5, 1*32(%rdi)
+	vmovdqu %ymm4, 2*32(%rdi)
+	vmovdqu %ymm3, 3*32(%rdi)
+	vmovdqu %ymm0, 4*32(%rdi)
+	vmovdqu %ymm8, 5*32(%rdi)
+	vmovdqu %ymm9, 6*32(%rdi)
+	vmovdqu %ymm10, 7*32(%rdi)
+
+	movq %r8, %rsp
+#if defined(_WIN64) || defined(__CYGWIN__)
+	popq %rsi
+	vmovdqa 0(%rsp), %xmm6
+	vmovdqa 16(%rsp), %xmm7
+	vmovdqa 32(%rsp), %xmm8
+	vmovdqa 48(%rsp), %xmm9
+	vmovdqa 64(%rsp), %xmm10
+	vmovdqa 80(%rsp), %xmm11
+	addq $96, %rsp
+	popq %rdi
+#endif
+	ret
+
+#endif /* USE_AVX2 */
+
+
+	.data
+	.p2align 3
+sha256d_ms_4way_addr:
+	.quad 0x0
+
+	.text
+	.p2align 6
+	.globl sha256d_ms_4way
+	.globl _sha256d_ms_4way
+sha256d_ms_4way:
+_sha256d_ms_4way:
+	jmp *sha256d_ms_4way_addr(%rip)
+
+
+	.p2align 6
+sha256d_ms_4way_sse2:
+#if defined(_WIN64) || defined(__CYGWIN__)
+	pushq %rdi
+	subq $32, %rsp
+	movdqa %xmm6, 0(%rsp)
+	movdqa %xmm7, 16(%rsp)
+	pushq %rsi
+	movq %rcx, %rdi
+	movq %rdx, %rsi
+	movq %r8, %rdx
+	movq %r9, %rcx
+#endif
+	subq $8+67*16, %rsp
+
+	leaq 256(%rsi), %rax
+
+sha256d_ms_4way_sse2_extend_loop1:
+	movdqa 3*16(%rsi), %xmm0
+	movdqa 2*16(%rax), %xmm3
+	movdqa 3*16(%rax), %xmm7
+	movdqa %xmm3, 5*16(%rsp)
+	movdqa %xmm7, 6*16(%rsp)
+	movdqa %xmm0, %xmm2
+	paddd %xmm0,
%xmm7 + psrld $3, %xmm0 + movdqa %xmm0, %xmm1 + pslld $14, %xmm2 + psrld $4, %xmm1 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + psrld $11, %xmm1 + pslld $11, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + paddd %xmm0, %xmm3 + movdqa %xmm3, 2*16(%rax) + movdqa %xmm7, 3*16(%rax) + + movdqa 4*16(%rax), %xmm0 + movdqa %xmm0, 7*16(%rsp) + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + movdqa %xmm3, 4*16(%rax) + movdqa %xmm7, 5*16(%rax) + + movdqa 6*16(%rax), %xmm0 + movdqa 7*16(%rax), %xmm4 + movdqa %xmm0, 9*16(%rsp) + movdqa %xmm4, 10*16(%rsp) + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 6*16(%rax) + movdqa %xmm7, 7*16(%rax) + + movdqa 8*16(%rax), %xmm0 + movdqa 2*16(%rax), %xmm4 + movdqa %xmm0, 11*16(%rsp) + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 8*16(%rax) + movdqa %xmm7, 9*16(%rax) + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd 3*16(%rax), %xmm3 + paddd 4*16(%rax), %xmm7 + movdqa %xmm3, 10*16(%rax) + movdqa %xmm7, 11*16(%rax) + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd 5*16(%rax), %xmm3 + paddd 6*16(%rax), %xmm7 + movdqa %xmm3, 12*16(%rax) + movdqa %xmm7, 13*16(%rax) + + movdqa 14*16(%rax), %xmm0 + movdqa 15*16(%rax), %xmm4 + movdqa %xmm0, 17*16(%rsp) + movdqa %xmm4, 18*16(%rsp) + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + paddd 7*16(%rax), %xmm0 + paddd 8*16(%rax), %xmm4 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, 
%xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 14*16(%rax) + movdqa %xmm7, 15*16(%rax) + +sha256d_ms_4way_sse2_extend_loop2: + movdqa (16-15)*16(%rax), %xmm0 + movdqa (16-14)*16(%rax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (16-16)*16(%rax), %xmm0 + paddd (16-15)*16(%rax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (16-7)*16(%rax), %xmm0 + paddd (16-6)*16(%rax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 16*16(%rax) + movdqa %xmm7, (16+1)*16(%rax) + movdqa (18-15)*16(%rax), %xmm0 + movdqa (18-14)*16(%rax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (18-16)*16(%rax), %xmm0 + paddd (18-15)*16(%rax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (18-7)*16(%rax), %xmm0 + paddd (18-6)*16(%rax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 18*16(%rax) + movdqa %xmm7, (18+1)*16(%rax) + movdqa (20-15)*16(%rax), %xmm0 + movdqa (20-14)*16(%rax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (20-16)*16(%rax), %xmm0 + paddd (20-15)*16(%rax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (20-7)*16(%rax), %xmm0 + paddd (20-6)*16(%rax), %xmm4 + + pxor %xmm1, %xmm3 + pxor 
%xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 20*16(%rax) + movdqa %xmm7, (20+1)*16(%rax) + movdqa (22-15)*16(%rax), %xmm0 + movdqa (22-14)*16(%rax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (22-16)*16(%rax), %xmm0 + paddd (22-15)*16(%rax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (22-7)*16(%rax), %xmm0 + paddd (22-6)*16(%rax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 22*16(%rax) + movdqa %xmm7, (22+1)*16(%rax) + movdqa (24-15)*16(%rax), %xmm0 + movdqa (24-14)*16(%rax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (24-16)*16(%rax), %xmm0 + paddd (24-15)*16(%rax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (24-7)*16(%rax), %xmm0 + paddd (24-6)*16(%rax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 24*16(%rax) + movdqa %xmm7, (24+1)*16(%rax) + movdqa (26-15)*16(%rax), %xmm0 + movdqa (26-14)*16(%rax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (26-16)*16(%rax), %xmm0 + paddd (26-15)*16(%rax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (26-7)*16(%rax), %xmm0 + paddd (26-6)*16(%rax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, 
%xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 26*16(%rax) + movdqa %xmm7, (26+1)*16(%rax) + movdqa (28-15)*16(%rax), %xmm0 + movdqa (28-14)*16(%rax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (28-16)*16(%rax), %xmm0 + paddd (28-15)*16(%rax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (28-7)*16(%rax), %xmm0 + paddd (28-6)*16(%rax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 28*16(%rax) + movdqa %xmm7, (28+1)*16(%rax) + movdqa (30-15)*16(%rax), %xmm0 + movdqa (30-14)*16(%rax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (30-16)*16(%rax), %xmm0 + paddd (30-15)*16(%rax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (30-7)*16(%rax), %xmm0 + paddd (30-6)*16(%rax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 30*16(%rax) + movdqa %xmm7, (30+1)*16(%rax) + movdqa (32-15)*16(%rax), %xmm0 + movdqa (32-14)*16(%rax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (32-16)*16(%rax), %xmm0 + paddd (32-15)*16(%rax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (32-7)*16(%rax), %xmm0 + paddd (32-6)*16(%rax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, 
%xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 32*16(%rax) + movdqa %xmm7, (32+1)*16(%rax) + movdqa (34-15)*16(%rax), %xmm0 + movdqa (34-14)*16(%rax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (34-16)*16(%rax), %xmm0 + paddd (34-15)*16(%rax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (34-7)*16(%rax), %xmm0 + paddd (34-6)*16(%rax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 34*16(%rax) + movdqa %xmm7, (34+1)*16(%rax) + movdqa (36-15)*16(%rax), %xmm0 + movdqa (36-14)*16(%rax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (36-16)*16(%rax), %xmm0 + paddd (36-15)*16(%rax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (36-7)*16(%rax), %xmm0 + paddd (36-6)*16(%rax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 36*16(%rax) + movdqa %xmm7, (36+1)*16(%rax) + movdqa (38-15)*16(%rax), %xmm0 + movdqa (38-14)*16(%rax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (38-16)*16(%rax), %xmm0 + paddd (38-15)*16(%rax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (38-7)*16(%rax), %xmm0 + paddd (38-6)*16(%rax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + 
pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 38*16(%rax) + movdqa %xmm7, (38+1)*16(%rax) + movdqa (40-15)*16(%rax), %xmm0 + movdqa (40-14)*16(%rax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (40-16)*16(%rax), %xmm0 + paddd (40-15)*16(%rax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (40-7)*16(%rax), %xmm0 + paddd (40-6)*16(%rax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 40*16(%rax) + movdqa %xmm7, (40+1)*16(%rax) + movdqa (42-15)*16(%rax), %xmm0 + movdqa (42-14)*16(%rax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (42-16)*16(%rax), %xmm0 + paddd (42-15)*16(%rax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (42-7)*16(%rax), %xmm0 + paddd (42-6)*16(%rax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 42*16(%rax) + movdqa %xmm7, (42+1)*16(%rax) + jz sha256d_ms_4way_sse2_extend_coda2 + movdqa (44-15)*16(%rax), %xmm0 + movdqa (44-14)*16(%rax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (44-16)*16(%rax), %xmm0 + paddd (44-15)*16(%rax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (44-7)*16(%rax), %xmm0 + paddd (44-6)*16(%rax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor 
%xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 44*16(%rax) + movdqa %xmm7, (44+1)*16(%rax) + movdqa (46-15)*16(%rax), %xmm0 + movdqa (46-14)*16(%rax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (46-16)*16(%rax), %xmm0 + paddd (46-15)*16(%rax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (46-7)*16(%rax), %xmm0 + paddd (46-6)*16(%rax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 46*16(%rax) + movdqa %xmm7, (46+1)*16(%rax) + + movdqa 0(%rcx), %xmm3 + movdqa 16(%rcx), %xmm0 + movdqa 32(%rcx), %xmm1 + movdqa 48(%rcx), %xmm2 + movdqa 64(%rcx), %xmm6 + movdqa 80(%rcx), %xmm7 + movdqa 96(%rcx), %xmm5 + movdqa 112(%rcx), %xmm4 + movdqa %xmm1, 0(%rsp) + movdqa %xmm2, 16(%rsp) + movdqa %xmm6, 32(%rsp) + + movq %rsi, %rax + leaq sha256_4k(%rip), %rcx + jmp sha256d_ms_4way_sse2_main_loop1 + +sha256d_ms_4way_sse2_main_loop2: + movdqa 16*(0)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(0)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(1)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(1)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 
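+	/* One 4-way round of sha256d_ms_4way_sse2: xmm6 now holds T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i].
+	   Next, d + T1 becomes the new e, and Maj(a,b,c) plus Sigma0(a) are folded in to form the new a. */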
+ movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(2)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(2)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 +sha256d_ms_4way_sse2_main_loop1: + movdqa 16*(3)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(3)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(4+0)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(4+0)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand 
%xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(4+1)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(4+1)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(4+2)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(4+2)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(4+3)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(4+3)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + 
pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(8+0)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(8+0)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(8+1)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(8+1)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(8+2)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(8+2)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, 
%xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(8+3)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(8+3)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(12+0)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(12+0)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(12+1)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(12+1)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, 
%xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(12+2)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(12+2)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(12+3)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(12+3)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(16+0)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(16+0)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor 
%xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(16+1)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(16+1)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(16+2)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(16+2)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(16+3)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(16+3)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(20+0)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + 
movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(20+0)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(20+1)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(20+1)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(20+2)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(20+2)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(20+3)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) 
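+	/* Ch(e,f,g) = (e & f) ^ (~e & g): g (16(%rsp)) has just been saved as the new h (32(%rsp));
+	   f (0(%rsp)) now rotates into the g slot and e into the f slot before the XOR completes Ch. */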
+ movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(20+3)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(24+0)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(24+0)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(24+1)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(24+1)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(24+2)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + 
movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(24+2)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(24+3)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(24+3)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(28+0)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(28+0)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(28+1)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 
16*(28+1)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(28+2)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(28+2)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(28+3)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(28+3)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(32+0)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(32+0)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, 
%xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(32+1)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(32+1)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(32+2)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(32+2)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(32+3)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(32+3)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, 
%xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(36+0)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(36+0)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(36+1)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(36+1)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(36+2)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(36+2)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + 
movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(36+3)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(36+3)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(40+0)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(40+0)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(40+1)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(40+1)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand 
%xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(40+2)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(40+2)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(40+3)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(40+3)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(44+0)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(44+0)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 
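+	/* b has just rotated into c and a is about to rotate into b; Maj(a,b,c) and Sigma0(a)
+	   are then added to T1 (xmm6) to produce the new a for the next round. */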
+ movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(44+1)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(44+1)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(44+2)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(44+2)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(44+3)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(44+3)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld 
$2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(48+0)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(48+0)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(48+1)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(48+1)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(48+2)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(48+2)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + 
pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(48+3)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(48+3)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(52+0)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(52+0)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(52+1)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(52+1)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + 
pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(52+2)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(52+2)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(52+3)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(52+3)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(56)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(56)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + jz sha256d_ms_4way_sse2_finish + 
movdqa 16*(57)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(57)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(58)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(58)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(59)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(59)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(60+0)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 
32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(60+0)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(60+1)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(60+1)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(60+2)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(60+2)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(60+3)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + 
pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(60+3)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + + movdqa 5*16(%rsp), %xmm1 + movdqa 6*16(%rsp), %xmm2 + movdqa 7*16(%rsp), %xmm6 + movdqa %xmm1, 18*16(%rsi) + movdqa %xmm2, 19*16(%rsi) + movdqa %xmm6, 20*16(%rsi) + movdqa 9*16(%rsp), %xmm1 + movdqa 10*16(%rsp), %xmm2 + movdqa 11*16(%rsp), %xmm6 + movdqa %xmm1, 22*16(%rsi) + movdqa %xmm2, 23*16(%rsi) + movdqa %xmm6, 24*16(%rsi) + movdqa 17*16(%rsp), %xmm1 + movdqa 18*16(%rsp), %xmm2 + movdqa %xmm1, 30*16(%rsi) + movdqa %xmm2, 31*16(%rsi) + + movdqa 0(%rsp), %xmm1 + movdqa 16(%rsp), %xmm2 + movdqa 32(%rsp), %xmm6 + paddd 0(%rdx), %xmm7 + paddd 16(%rdx), %xmm5 + paddd 32(%rdx), %xmm4 + paddd 48(%rdx), %xmm3 + paddd 64(%rdx), %xmm0 + paddd 80(%rdx), %xmm1 + paddd 96(%rdx), %xmm2 + paddd 112(%rdx), %xmm6 + + movdqa %xmm7, 48+0(%rsp) + movdqa %xmm5, 48+16(%rsp) + movdqa %xmm4, 48+32(%rsp) + movdqa %xmm3, 48+48(%rsp) + movdqa %xmm0, 48+64(%rsp) + movdqa %xmm1, 48+80(%rsp) + movdqa %xmm2, 48+96(%rsp) + movdqa %xmm6, 48+112(%rsp) + + pxor %xmm0, %xmm0 + movq $0x8000000000000100, %rax + movd %rax, %xmm1 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0x00, %xmm1, %xmm1 + movdqa %xmm2, 48+128(%rsp) + movdqa %xmm0, 48+144(%rsp) + movdqa %xmm0, 48+160(%rsp) + movdqa %xmm0, 48+176(%rsp) + movdqa %xmm0, 48+192(%rsp) + movdqa %xmm0, 48+208(%rsp) + movdqa %xmm0, 48+224(%rsp) + movdqa %xmm1, 48+240(%rsp) + + leaq 19*16(%rsp), %rax + cmpq %rax, %rax + + movdqa -15*16(%rax), %xmm0 + movdqa -14*16(%rax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + paddd -16*16(%rax), %xmm0 + paddd -15*16(%rax), %xmm4 + paddd sha256d_4preext2_17(%rip), %xmm4 + movdqa %xmm0, %xmm3 + movdqa %xmm4, %xmm7 + movdqa %xmm3, 0*16(%rax) + movdqa %xmm7, 1*16(%rax) + + movdqa (2-15)*16(%rax), %xmm0 + movdqa (2-14)*16(%rax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (2-16)*16(%rax), %xmm0 + paddd (2-15)*16(%rax), %xmm4 + + 
movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (2-7)*16(%rax), %xmm0 + paddd (2-6)*16(%rax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 2*16(%rax) + movdqa %xmm7, (2+1)*16(%rax) + movdqa (4-15)*16(%rax), %xmm0 + movdqa (4-14)*16(%rax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (4-16)*16(%rax), %xmm0 + paddd (4-15)*16(%rax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (4-7)*16(%rax), %xmm0 + paddd (4-6)*16(%rax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 4*16(%rax) + movdqa %xmm7, (4+1)*16(%rax) + + movdqa -9*16(%rax), %xmm0 + movdqa sha256d_4preext2_23(%rip), %xmm4 + movdqa %xmm0, %xmm2 + psrld $3, %xmm0 + movdqa %xmm0, %xmm1 + pslld $14, %xmm2 + psrld $4, %xmm1 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + psrld $11, %xmm1 + pslld $11, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + paddd -10*16(%rax), %xmm0 + paddd -9*16(%rax), %xmm4 + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + paddd -1*16(%rax), %xmm0 pslld $13, %xmm2 pslld $13, %xmm6 psrld $7, %xmm1 psrld $7, %xmm5 + paddd 0*16(%rax), %xmm4 pxor %xmm1, %xmm3 pxor %xmm5, %xmm7 psrld $2, %xmm1 @@ -2076,767 +18149,7004 @@ sha256d_ms_4way_sse2_extend_loop1: movdqa %xmm3, 6*16(%rax) movdqa %xmm7, 7*16(%rax) - movdqa 8*16(%rax), %xmm0 - movdqa 2*16(%rax), %xmm4 - movdqa %xmm0, 11*16(%rsp) - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd %xmm0, %xmm3 - paddd %xmm4, %xmm7 - movdqa %xmm3, 8*16(%rax) - movdqa %xmm7, 9*16(%rax) + movdqa sha256d_4preext2_24(%rip), %xmm0 + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + paddd 1*16(%rax), %xmm0 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor 
%xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd 2*16(%rax), %xmm7 + movdqa %xmm3, 8*16(%rax) + movdqa %xmm7, 9*16(%rax) + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd 3*16(%rax), %xmm3 + paddd 4*16(%rax), %xmm7 + movdqa %xmm3, 10*16(%rax) + movdqa %xmm7, 11*16(%rax) + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd 5*16(%rax), %xmm3 + paddd 6*16(%rax), %xmm7 + movdqa %xmm3, 12*16(%rax) + movdqa %xmm7, 13*16(%rax) + + movdqa sha256d_4preext2_30(%rip), %xmm0 + movdqa 0*16(%rax), %xmm4 + movdqa %xmm4, %xmm6 + psrld $3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $14, %xmm6 + psrld $4, %xmm5 + pxor %xmm5, %xmm4 + pxor %xmm6, %xmm4 + psrld $11, %xmm5 + pslld $11, %xmm6 + pxor %xmm5, %xmm4 + pxor %xmm6, %xmm4 + paddd -1*16(%rax), %xmm4 + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + paddd 7*16(%rax), %xmm0 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + paddd 8*16(%rax), %xmm4 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 14*16(%rax) + movdqa %xmm7, 15*16(%rax) + + jmp sha256d_ms_4way_sse2_extend_loop2 + +sha256d_ms_4way_sse2_extend_coda2: + movdqa (44-15)*16(%rax), %xmm0 + movdqa %xmm0, %xmm2 + psrld $3, %xmm0 + movdqa %xmm0, %xmm1 + pslld $14, %xmm2 + psrld $4, %xmm1 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + psrld $11, %xmm1 + pslld $11, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + paddd (44-16)*16(%rax), %xmm0 + paddd (44-7)*16(%rax), %xmm0 + + movdqa %xmm3, %xmm2 + psrld $10, %xmm3 + pslld $13, %xmm2 + movdqa %xmm3, %xmm1 + psrld $7, %xmm1 + pxor %xmm1, %xmm3 + pxor %xmm2, %xmm3 + psrld $2, %xmm1 + pslld $2, %xmm2 + pxor %xmm1, %xmm3 + pxor %xmm2, %xmm3 + paddd %xmm0, %xmm3 + movdqa %xmm3, 44*16(%rax) + + movdqa sha256_4h+0(%rip), %xmm7 + movdqa sha256_4h+16(%rip), %xmm5 + movdqa sha256_4h+32(%rip), %xmm4 + movdqa sha256_4h+48(%rip), %xmm3 + movdqa sha256_4h+64(%rip), %xmm0 + movdqa sha256_4h+80(%rip), %xmm1 + movdqa sha256_4h+96(%rip), %xmm2 + movdqa sha256_4h+112(%rip), %xmm6 + movdqa %xmm1, 0(%rsp) + movdqa %xmm2, 16(%rsp) + movdqa %xmm6, 32(%rsp) + + leaq 48(%rsp), %rax + leaq sha256_4k(%rip), %rcx + jmp sha256d_ms_4way_sse2_main_loop2 + + +sha256d_ms_4way_sse2_finish: + movdqa 16*57(%rax), %xmm6 + paddd 16*57(%rcx), %xmm6 + paddd 32(%rsp), %xmm6 + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + paddd %xmm3, %xmm6 + pandn %xmm2, %xmm1 + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + pand %xmm0, 
%xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + paddd %xmm1, %xmm6 + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $5, %xmm1 + pxor %xmm1, %xmm0 + paddd %xmm6, %xmm0 + movdqa 16*58(%rax), %xmm6 + paddd 16*58(%rcx), %xmm6 + paddd 32(%rsp), %xmm6 + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + paddd %xmm4, %xmm6 + pandn %xmm2, %xmm1 + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + paddd %xmm1, %xmm6 + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $5, %xmm1 + pxor %xmm1, %xmm0 + paddd %xmm6, %xmm0 + movdqa 16*59(%rax), %xmm6 + paddd 16*59(%rcx), %xmm6 + paddd 32(%rsp), %xmm6 + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + paddd %xmm5, %xmm6 + pandn %xmm2, %xmm1 + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + paddd %xmm1, %xmm6 + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $5, %xmm1 + pxor %xmm1, %xmm0 + paddd %xmm6, %xmm0 + movdqa 16*60(%rax), %xmm6 + paddd 16*60(%rcx), %xmm6 + paddd 32(%rsp), %xmm6 + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + paddd %xmm7, %xmm6 + pandn %xmm2, %xmm1 + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + paddd %xmm1, %xmm6 + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $5, %xmm1 + pxor %xmm1, %xmm0 + paddd %xmm6, %xmm0 + + paddd sha256_4h+112(%rip), %xmm0 + movdqa %xmm0, 112(%rdi) + + addq $8+67*16, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + movdqa 0(%rsp), %xmm6 + movdqa 16(%rsp), %xmm7 + addq $32, %rsp + popq %rdi +#endif + ret + + +#if defined(USE_AVX) + + .p2align 6 +sha256d_ms_4way_avx: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + subq $80, %rsp + movdqa %xmm6, 0(%rsp) + movdqa %xmm7, 16(%rsp) + movdqa %xmm8, 32(%rsp) + movdqa %xmm9, 48(%rsp) + movdqa %xmm10, 64(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + subq $1032, %rsp + + leaq 256(%rsi), %rax + +sha256d_ms_4way_avx_extend_loop1: + vmovdqa 3*16(%rsi), %xmm0 + vmovdqa 2*16(%rax), %xmm3 + vmovdqa 3*16(%rax), %xmm7 + vmovdqa %xmm3, 2*16(%rsp) + vmovdqa %xmm7, 3*16(%rsp) + vpaddd %xmm0, %xmm7, %xmm7 + vpslld $14, %xmm0, %xmm2 + vpsrld $3, %xmm0, %xmm0 + vpsrld $4, %xmm0, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpsrld $11, %xmm1, %xmm1 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm3, %xmm3 + vmovdqa %xmm3, 2*16(%rax) + vmovdqa %xmm7, 3*16(%rax) + + vmovdqa 4*16(%rax), %xmm0 + vmovdqa %xmm0, 4*16(%rsp) + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, 
%xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vmovdqa %xmm3, 4*16(%rax) + vmovdqa %xmm7, 5*16(%rax) + + vmovdqa 6*16(%rax), %xmm0 + vmovdqa 7*16(%rax), %xmm4 + vmovdqa %xmm0, 6*16(%rsp) + vmovdqa %xmm4, 7*16(%rsp) + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 6*16(%rax) + vmovdqa %xmm7, 7*16(%rax) + + vmovdqa 8*16(%rax), %xmm0 + vmovdqa 2*16(%rax), %xmm4 + vmovdqa %xmm0, 8*16(%rsp) + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 8*16(%rax) + vmovdqa %xmm7, 9*16(%rax) + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 3*16(%rax), %xmm3, %xmm3 + vpaddd 4*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 10*16(%rax) + vmovdqa %xmm7, 11*16(%rax) + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 5*16(%rax), %xmm3, %xmm3 + vpaddd 6*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 12*16(%rax) + vmovdqa %xmm7, 13*16(%rax) + + vmovdqa 14*16(%rax), %xmm0 + vmovdqa 15*16(%rax), %xmm4 + vmovdqa %xmm0, 14*16(%rsp) + vmovdqa %xmm4, 15*16(%rsp) + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpaddd 7*16(%rax), %xmm0, %xmm0 + vpaddd 8*16(%rax), %xmm4, %xmm4 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld 
$2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 14*16(%rax) + vmovdqa %xmm7, 15*16(%rax) + +sha256d_ms_4way_avx_extend_loop2: + vmovdqa (16-15)*16(%rax), %xmm0 + vmovdqa (16-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (16-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (16-7)*16(%rax), %xmm0, %xmm0 + vpaddd (16-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 16*16(%rax) + vmovdqa %xmm7, (16+1)*16(%rax) + vmovdqa (18-15)*16(%rax), %xmm0 + vmovdqa (18-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (18-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (18-7)*16(%rax), %xmm0, %xmm0 + vpaddd (18-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 18*16(%rax) + vmovdqa %xmm7, (18+1)*16(%rax) + vmovdqa (20-15)*16(%rax), %xmm0 + vmovdqa (20-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd 
(20-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (20-7)*16(%rax), %xmm0, %xmm0 + vpaddd (20-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 20*16(%rax) + vmovdqa %xmm7, (20+1)*16(%rax) + vmovdqa (22-15)*16(%rax), %xmm0 + vmovdqa (22-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (22-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (22-7)*16(%rax), %xmm0, %xmm0 + vpaddd (22-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 22*16(%rax) + vmovdqa %xmm7, (22+1)*16(%rax) + vmovdqa (24-15)*16(%rax), %xmm0 + vmovdqa (24-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (24-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (24-7)*16(%rax), %xmm0, %xmm0 + vpaddd (24-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 24*16(%rax) + vmovdqa %xmm7, (24+1)*16(%rax) + vmovdqa (26-15)*16(%rax), %xmm0 + vmovdqa (26-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld 
$7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (26-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (26-7)*16(%rax), %xmm0, %xmm0 + vpaddd (26-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 26*16(%rax) + vmovdqa %xmm7, (26+1)*16(%rax) + vmovdqa (28-15)*16(%rax), %xmm0 + vmovdqa (28-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (28-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (28-7)*16(%rax), %xmm0, %xmm0 + vpaddd (28-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 28*16(%rax) + vmovdqa %xmm7, (28+1)*16(%rax) + vmovdqa (30-15)*16(%rax), %xmm0 + vmovdqa (30-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (30-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (30-7)*16(%rax), %xmm0, %xmm0 + vpaddd (30-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + 
vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 30*16(%rax) + vmovdqa %xmm7, (30+1)*16(%rax) + vmovdqa (32-15)*16(%rax), %xmm0 + vmovdqa (32-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (32-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (32-7)*16(%rax), %xmm0, %xmm0 + vpaddd (32-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 32*16(%rax) + vmovdqa %xmm7, (32+1)*16(%rax) + vmovdqa (34-15)*16(%rax), %xmm0 + vmovdqa (34-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (34-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (34-7)*16(%rax), %xmm0, %xmm0 + vpaddd (34-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 34*16(%rax) + vmovdqa %xmm7, (34+1)*16(%rax) + vmovdqa (36-15)*16(%rax), %xmm0 + vmovdqa (36-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (36-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, 
%xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (36-7)*16(%rax), %xmm0, %xmm0 + vpaddd (36-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 36*16(%rax) + vmovdqa %xmm7, (36+1)*16(%rax) + vmovdqa (38-15)*16(%rax), %xmm0 + vmovdqa (38-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (38-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (38-7)*16(%rax), %xmm0, %xmm0 + vpaddd (38-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 38*16(%rax) + vmovdqa %xmm7, (38+1)*16(%rax) + vmovdqa (40-15)*16(%rax), %xmm0 + vmovdqa (40-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (40-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (40-7)*16(%rax), %xmm0, %xmm0 + vpaddd (40-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 40*16(%rax) + vmovdqa %xmm7, (40+1)*16(%rax) + vmovdqa (42-15)*16(%rax), %xmm0 + vmovdqa (42-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + 
vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (42-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (42-7)*16(%rax), %xmm0, %xmm0 + vpaddd (42-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 42*16(%rax) + vmovdqa %xmm7, (42+1)*16(%rax) + jz sha256d_ms_4way_avx_extend_coda2 + vmovdqa (44-15)*16(%rax), %xmm0 + vmovdqa (44-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (44-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (44-7)*16(%rax), %xmm0, %xmm0 + vpaddd (44-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 44*16(%rax) + vmovdqa %xmm7, (44+1)*16(%rax) + vmovdqa (46-15)*16(%rax), %xmm0 + vmovdqa (46-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (46-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (46-7)*16(%rax), %xmm0, %xmm0 + vpaddd (46-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor 
%xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 46*16(%rax) + vmovdqa %xmm7, (46+1)*16(%rax) + + movdqa 0(%rcx), %xmm7 + movdqa 16(%rcx), %xmm8 + movdqa 32(%rcx), %xmm9 + movdqa 48(%rcx), %xmm10 + movdqa 64(%rcx), %xmm0 + movdqa 80(%rcx), %xmm5 + movdqa 96(%rcx), %xmm4 + movdqa 112(%rcx), %xmm3 + + movq %rsi, %rax + leaq sha256_4k(%rip), %rcx + jmp sha256d_ms_4way_avx_main_loop1 + +sha256d_ms_4way_avx_main_loop2: + vpaddd 16*(0)(%rax), %xmm10, %xmm6 + vpaddd 16*(0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(1)(%rax), %xmm9, %xmm6 + vpaddd 16*(1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(2)(%rax), %xmm8, %xmm6 + vpaddd 16*(2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor 
%xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 +sha256d_ms_4way_avx_main_loop1: + vpaddd 16*(3)(%rax), %xmm0, %xmm6 + vpaddd 16*(3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(4+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(4+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(4+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(4+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(4+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(4+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 
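+	/* Each unrolled round computes, per 32-bit lane:
+	 *   T1 = h + S1(e) + Ch(e,f,g) + K[i] + W[i],  T2 = S0(a) + Maj(a,b,c),
+	 *   e' = d + T1,  a' = T1 + T2,
+	 * where S1(e) = ROTR(e,6) ^ ROTR(e,11) ^ ROTR(e,25) and
+	 *       S0(a) = ROTR(a,2) ^ ROTR(a,13) ^ ROTR(a,22).
+	 * The eight working variables live in %xmm0, %xmm3-%xmm5 and %xmm7-%xmm10
+	 * and rotate by renaming from round to round; %xmm1, %xmm2 and %xmm6 are
+	 * scratch. */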
+ vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(4+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(4+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(8+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(8+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(8+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(8+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, 
%xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(8+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(8+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(8+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(8+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(12+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(12+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, 
%xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(12+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(12+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(12+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(12+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(12+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(12+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(16+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(16+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor 
%xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(16+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(16+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(16+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(16+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(16+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(16+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, 
%xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(20+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(20+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(20+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(20+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(20+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(20+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 
16*(20+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(20+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(24+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(24+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(24+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(24+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(24+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(24+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 
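+	/* Operand addressing in this loop: 16*(i)(%rax) is the expanded message
+	 * word W[i] for the four lanes, and 16*(i)(%rcx) indexes sha256_4k, whose
+	 * i-th 16-byte entry holds the round constant K[i] replicated across all
+	 * four lanes. */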
+ vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(24+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(24+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(28+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(28+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(28+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(28+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + 
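+	/* S0(a) = ROTR(a,2) ^ ROTR(a,13) ^ ROTR(a,22): the vpslld $10 above
+	 * together with the vpsrld $2/$11/$9 and vpslld $9/$11 steps below build
+	 * the three rotates from shift pairs. */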
vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(28+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(28+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(28+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(28+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(32+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(32+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(32+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(32+1)(%rcx), %xmm6, %xmm6 + 
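+	/* Ch(e,f,g) = (e & f) ^ (~e & g): the vpandn/vpand/vpxor triple below,
+	 * followed by S1(e) built from the same shift-and-xor scheme as S0. */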
+ vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(32+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(32+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(32+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(32+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(36+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(36+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, 
%xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(36+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(36+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(36+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(36+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(36+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(36+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + 
vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(40+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(40+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(40+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(40+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(40+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(40+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(40+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(40+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, 
%xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(44+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(44+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(44+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(44+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(44+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(44+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + 
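+	/* e' = d + T1 (the vpaddd just below); Maj(a,b,c) = (a&b) ^ (a&c) ^ (b&c)
+	 * follows as three vpands and two vpxors, and the round ends with
+	 * a' = T1 + Maj(a,b,c) + S0(a). */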
vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(44+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(44+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(48+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(48+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(48+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(48+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, 
%xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(48+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(48+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(48+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(48+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(52+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(52+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(52+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(52+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 
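+	/* A few rounds further down, the jz to sha256d_ms_4way_avx_finish takes an
+	 * early exit on the final pass (the second of the two SHA-256 applications),
+	 * where the remaining rounds are not needed for the digest words that the
+	 * nonce test inspects. */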
+ vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(52+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(52+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(52+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(52+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(56)(%rax), %xmm10, %xmm6 + vpaddd 16*(56)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, 
%xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + jz sha256d_ms_4way_avx_finish + vpaddd 16*(57)(%rax), %xmm9, %xmm6 + vpaddd 16*(57)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(58)(%rax), %xmm8, %xmm6 + vpaddd 16*(58)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(59)(%rax), %xmm0, %xmm6 + vpaddd 16*(59)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + 
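For reference, the vpandn/vpand/vpxor groups and the shift/XOR chains in the rounds above are the 4-way AVX form of one SHA-256 compression round; the rotates are synthesized from shift pairs because SSE/AVX (without XOP or AVX-512) has no packed rotate. Below is a minimal scalar C sketch of the same round using the standard FIPS 180-4 definitions; all identifiers are illustrative and are not taken from this patch.

#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }

/* One SHA-256 round: s[0..7] = a..h working state, k = round constant,
 * w = message-schedule word.  The packed code performs this arithmetic
 * on four independent states held lane-wise in xmm registers. */
static inline void sha256_round(uint32_t s[8], uint32_t k, uint32_t w)
{
    uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
    uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

    uint32_t S1  = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
    uint32_t ch  = (e & f) ^ (~e & g);               /* vpand / vpandn / vpxor */
    uint32_t t1  = h + S1 + ch + k + w;
    uint32_t S0  = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
    uint32_t maj = (a & b) ^ (a & c) ^ (b & c);      /* three vpand, two vpxor */
    uint32_t t2  = S0 + maj;

    s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
    s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
}

In the hand-unrolled assembly the register roles rotate instead of the data, which is why successive rounds write the new "a" and "e" into different xmm registers rather than shuffling a fixed state array.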
vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(60+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(60+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $11, %xmm3, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(60+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(60+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm3, %xmm2 + vpsrld $2, %xmm3, %xmm4 + vpsrld $11, %xmm4, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(60+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(60+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm5 + vpsrld $11, %xmm5, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm5, %xmm5 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(60+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(60+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, 
%xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, %xmm5, %xmm2 + vpsrld $2, %xmm5, %xmm7 + vpsrld $11, %xmm7, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm7, %xmm7 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + + movdqa 2*16(%rsp), %xmm1 + movdqa 3*16(%rsp), %xmm2 + movdqa 4*16(%rsp), %xmm6 + movdqa %xmm1, 18*16(%rsi) + movdqa %xmm2, 19*16(%rsi) + movdqa %xmm6, 20*16(%rsi) + movdqa 6*16(%rsp), %xmm1 + movdqa 7*16(%rsp), %xmm2 + movdqa 8*16(%rsp), %xmm6 + movdqa %xmm1, 22*16(%rsi) + movdqa %xmm2, 23*16(%rsi) + movdqa %xmm6, 24*16(%rsi) + movdqa 14*16(%rsp), %xmm1 + movdqa 15*16(%rsp), %xmm2 + movdqa %xmm1, 30*16(%rsi) + movdqa %xmm2, 31*16(%rsi) - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd 3*16(%rax), %xmm3 - paddd 4*16(%rax), %xmm7 - movdqa %xmm3, 10*16(%rax) - movdqa %xmm7, 11*16(%rax) + paddd 0(%rdx), %xmm7 + paddd 16(%rdx), %xmm5 + paddd 32(%rdx), %xmm4 + paddd 48(%rdx), %xmm3 + paddd 64(%rdx), %xmm0 + paddd 80(%rdx), %xmm8 + paddd 96(%rdx), %xmm9 + paddd 112(%rdx), %xmm10 + + movdqa %xmm7, 0(%rsp) + movdqa %xmm5, 16(%rsp) + movdqa %xmm4, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm0, 64(%rsp) + movdqa %xmm8, 80(%rsp) + movdqa %xmm9, 96(%rsp) + movdqa %xmm10, 112(%rsp) + + pxor %xmm0, %xmm0 + movq $0x8000000000000100, %rax + movd %rax, %xmm1 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0x00, %xmm1, %xmm1 + movdqa %xmm2, 128(%rsp) + movdqa %xmm0, 144(%rsp) + movdqa %xmm0, 160(%rsp) + movdqa %xmm0, 176(%rsp) + movdqa %xmm0, 192(%rsp) + movdqa %xmm0, 208(%rsp) + movdqa %xmm0, 224(%rsp) + movdqa %xmm1, 240(%rsp) + + leaq 256(%rsp), %rax + cmpq %rax, %rax + + vmovdqa -15*16(%rax), %xmm0 + vmovdqa -14*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd -16*16(%rax), %xmm8, %xmm3 + vpaddd sha256d_4preext2_17(%rip), %xmm4, %xmm7 + vmovdqa %xmm3, 0*16(%rax) + vmovdqa %xmm7, 1*16(%rax) + + vmovdqa (2-15)*16(%rax), %xmm0 + vmovdqa (2-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, 
%xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (2-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (2-7)*16(%rax), %xmm0, %xmm0 + vpaddd (2-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 2*16(%rax) + vmovdqa %xmm7, (2+1)*16(%rax) + vmovdqa (4-15)*16(%rax), %xmm0 + vmovdqa (4-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (4-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (4-7)*16(%rax), %xmm0, %xmm0 + vpaddd (4-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 4*16(%rax) + vmovdqa %xmm7, (4+1)*16(%rax) - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd 5*16(%rax), %xmm3 - paddd 6*16(%rax), %xmm7 - movdqa %xmm3, 12*16(%rax) - movdqa %xmm7, 13*16(%rax) + vmovdqa -9*16(%rax), %xmm0 + vpslld $14, %xmm0, %xmm2 + vpsrld $3, %xmm0, %xmm8 + vpsrld $7, %xmm0, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpsrld $11, %xmm1, %xmm1 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd sha256d_4preext2_23(%rip), %xmm0, %xmm4 + vpaddd -10*16(%rax), %xmm8, %xmm0 + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpaddd -1*16(%rax), %xmm0, %xmm0 + vpaddd 0*16(%rax), %xmm4, %xmm4 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, 
%xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 6*16(%rax) + vmovdqa %xmm7, 7*16(%rax) - movdqa 14*16(%rax), %xmm0 - movdqa 15*16(%rax), %xmm4 - movdqa %xmm0, 17*16(%rsp) - movdqa %xmm4, 18*16(%rsp) - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - paddd 7*16(%rax), %xmm0 - paddd 8*16(%rax), %xmm4 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd %xmm0, %xmm3 - paddd %xmm4, %xmm7 - movdqa %xmm3, 14*16(%rax) - movdqa %xmm7, 15*16(%rax) + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd sha256d_4preext2_24(%rip), %xmm3, %xmm3 + vpaddd 1*16(%rax), %xmm3, %xmm3 + vpaddd 2*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 8*16(%rax) + vmovdqa %xmm7, 9*16(%rax) -sha256d_ms_4way_sse2_extend_loop2: - sha256_sse2_extend_doubleround 16 - sha256_sse2_extend_doubleround 18 - sha256_sse2_extend_doubleround 20 - sha256_sse2_extend_doubleround 22 - sha256_sse2_extend_doubleround 24 - sha256_sse2_extend_doubleround 26 - sha256_sse2_extend_doubleround 28 - sha256_sse2_extend_doubleround 30 - sha256_sse2_extend_doubleround 32 - sha256_sse2_extend_doubleround 34 - sha256_sse2_extend_doubleround 36 - sha256_sse2_extend_doubleround 38 - sha256_sse2_extend_doubleround 40 - sha256_sse2_extend_doubleround 42 - jz sha256d_ms_4way_sse2_extend_coda2 - sha256_sse2_extend_doubleround 44 - sha256_sse2_extend_doubleround 46 + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 3*16(%rax), %xmm3, %xmm3 + vpaddd 4*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 10*16(%rax) + vmovdqa %xmm7, 11*16(%rax) - movdqa 0(%rcx), %xmm3 - movdqa 16(%rcx), %xmm0 - movdqa 32(%rcx), %xmm1 - movdqa 48(%rcx), %xmm2 - movdqa 64(%rcx), %xmm6 - movdqa 80(%rcx), %xmm7 - movdqa 96(%rcx), %xmm5 - movdqa 112(%rcx), %xmm4 - movdqa %xmm1, 0(%rsp) - movdqa %xmm2, 16(%rsp) - movdqa %xmm6, 32(%rsp) + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, 
%xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 5*16(%rax), %xmm3, %xmm3 + vpaddd 6*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 12*16(%rax) + vmovdqa %xmm7, 13*16(%rax) - movq %rsi, %rax - leaq sha256_4k(%rip), %rcx - jmp sha256d_ms_4way_sse2_main_loop1 + vmovdqa sha256d_4preext2_30(%rip), %xmm0 + vmovdqa 0*16(%rax), %xmm4 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm4, %xmm4 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm6, %xmm4, %xmm4 + vpsrld $11, %xmm5, %xmm5 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm6, %xmm4, %xmm4 + vpaddd -1*16(%rax), %xmm4, %xmm4 + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpaddd 7*16(%rax), %xmm0, %xmm0 + vpaddd 8*16(%rax), %xmm4, %xmm4 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 14*16(%rax) + vmovdqa %xmm7, 15*16(%rax) -sha256d_ms_4way_sse2_main_loop2: - sha256_sse2_main_round 0 - sha256_sse2_main_round 1 - sha256_sse2_main_round 2 -sha256d_ms_4way_sse2_main_loop1: - sha256_sse2_main_round 3 - sha256_sse2_main_quadround 4 - sha256_sse2_main_quadround 8 - sha256_sse2_main_quadround 12 - sha256_sse2_main_quadround 16 - sha256_sse2_main_quadround 20 - sha256_sse2_main_quadround 24 - sha256_sse2_main_quadround 28 - sha256_sse2_main_quadround 32 - sha256_sse2_main_quadround 36 - sha256_sse2_main_quadround 40 - sha256_sse2_main_quadround 44 - sha256_sse2_main_quadround 48 - sha256_sse2_main_quadround 52 - sha256_sse2_main_round 56 - jz sha256d_ms_4way_sse2_finish - sha256_sse2_main_round 57 - sha256_sse2_main_round 58 - sha256_sse2_main_round 59 - sha256_sse2_main_quadround 60 + jmp sha256d_ms_4way_avx_extend_loop2 - movdqa 5*16(%rsp), %xmm1 - movdqa 6*16(%rsp), %xmm2 - movdqa 7*16(%rsp), %xmm6 - movdqa %xmm1, 18*16(%rsi) - movdqa %xmm2, 19*16(%rsi) - movdqa %xmm6, 20*16(%rsi) - movdqa 9*16(%rsp), %xmm1 - movdqa 10*16(%rsp), %xmm2 - movdqa 11*16(%rsp), %xmm6 - movdqa %xmm1, 22*16(%rsi) - movdqa %xmm2, 23*16(%rsi) - movdqa %xmm6, 24*16(%rsi) - movdqa 17*16(%rsp), %xmm1 - movdqa 18*16(%rsp), %xmm2 - movdqa %xmm1, 30*16(%rsi) - movdqa %xmm2, 31*16(%rsi) +sha256d_ms_4way_avx_extend_coda2: + vmovdqa (44-15)*16(%rax), %xmm0 + vpslld $14, %xmm0, %xmm2 + vpsrld $3, %xmm0, %xmm0 + vpsrld $4, %xmm0, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpsrld $11, %xmm1, %xmm1 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd (44-16)*16(%rax), %xmm0, %xmm0 + vpaddd (44-7)*16(%rax), %xmm0, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpsrld $10, %xmm3, %xmm3 + vpsrld $7, %xmm3, %xmm1 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpsrld $2, %xmm1, %xmm1 + vpslld $2, %xmm2, %xmm2 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm0, %xmm3, %xmm3 + vmovdqa %xmm3, 44*16(%rax) - movdqa 0(%rsp), %xmm1 - movdqa 16(%rsp), %xmm2 - movdqa 32(%rsp), %xmm6 - paddd 0(%rdx), %xmm7 - paddd 16(%rdx), %xmm5 - paddd 32(%rdx), %xmm4 - paddd 48(%rdx), %xmm3 - paddd 64(%rdx), %xmm0 - 
paddd 80(%rdx), %xmm1 - paddd 96(%rdx), %xmm2 - paddd 112(%rdx), %xmm6 + movdqa sha256_4h+0(%rip), %xmm7 + movdqa sha256_4h+16(%rip), %xmm5 + movdqa sha256_4h+32(%rip), %xmm4 + movdqa sha256_4h+48(%rip), %xmm3 + movdqa sha256_4h+64(%rip), %xmm0 + movdqa sha256_4h+80(%rip), %xmm8 + movdqa sha256_4h+96(%rip), %xmm9 + movdqa sha256_4h+112(%rip), %xmm10 - movdqa %xmm7, 48+0(%rsp) - movdqa %xmm5, 48+16(%rsp) - movdqa %xmm4, 48+32(%rsp) - movdqa %xmm3, 48+48(%rsp) - movdqa %xmm0, 48+64(%rsp) - movdqa %xmm1, 48+80(%rsp) - movdqa %xmm2, 48+96(%rsp) - movdqa %xmm6, 48+112(%rsp) + movq %rsp, %rax + leaq sha256_4k(%rip), %rcx + jmp sha256d_ms_4way_avx_main_loop2 + + +sha256d_ms_4way_avx_finish: + vpaddd 16*57(%rax), %xmm9, %xmm6 + vpaddd 16*57(%rcx), %xmm6, %xmm6 + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + vpslld $7, %xmm10, %xmm1 + vpsrld $6, %xmm10, %xmm9 + vpsrld $5, %xmm9, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + vpaddd 16*58(%rax), %xmm8, %xmm6 + vpaddd 16*58(%rcx), %xmm6, %xmm6 + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + vpslld $7, %xmm9, %xmm1 + vpsrld $6, %xmm9, %xmm8 + vpsrld $5, %xmm8, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + vpaddd 16*59(%rax), %xmm0, %xmm6 + vpaddd 16*59(%rcx), %xmm6, %xmm6 + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + vpslld $7, %xmm8, %xmm1 + vpsrld $6, %xmm8, %xmm0 + vpsrld $5, %xmm0, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + vpaddd 16*60(%rax), %xmm10, %xmm6 + vpaddd 16*60(%rcx), %xmm6, %xmm6 + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + vpslld $7, %xmm0, %xmm1 + vpsrld $6, %xmm0, %xmm10 + vpsrld $5, %xmm10, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, %xmm10, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 - pxor %xmm0, %xmm0 - movq $0x8000000000000100, %rax - movd %rax, %xmm1 - pshufd $0x55, %xmm1, %xmm2 - pshufd $0x00, %xmm1, %xmm1 - movdqa %xmm2, 48+128(%rsp) - movdqa %xmm0, 48+144(%rsp) - movdqa %xmm0, 48+160(%rsp) - movdqa %xmm0, 48+176(%rsp) - movdqa %xmm0, 48+192(%rsp) - movdqa %xmm0, 48+208(%rsp) - movdqa %xmm0, 48+224(%rsp) - movdqa %xmm1, 48+240(%rsp) + paddd sha256_4h+112(%rip), %xmm10 + movdqa %xmm10, 112(%rdi) - leaq 19*16(%rsp), %rax - cmpq %rax, %rax + addq $1032, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + movdqa 0(%rsp), %xmm6 + movdqa 16(%rsp), %xmm7 + movdqa 32(%rsp), %xmm8 + movdqa 48(%rsp), %xmm9 + movdqa 64(%rsp), %xmm10 + addq $80, 
%rsp + popq %rdi +#endif + ret - movdqa -15*16(%rax), %xmm0 - movdqa -14*16(%rax), %xmm4 - movdqa %xmm0, %xmm2 - movdqa %xmm4, %xmm6 - psrld $3, %xmm0 - psrld $3, %xmm4 - movdqa %xmm0, %xmm1 - movdqa %xmm4, %xmm5 - pslld $14, %xmm2 - pslld $14, %xmm6 - psrld $4, %xmm1 - psrld $4, %xmm5 - pxor %xmm1, %xmm0 - pxor %xmm5, %xmm4 - psrld $11, %xmm1 - psrld $11, %xmm5 - pxor %xmm2, %xmm0 - pxor %xmm6, %xmm4 - pslld $11, %xmm2 - pslld $11, %xmm6 - pxor %xmm1, %xmm0 - pxor %xmm5, %xmm4 - pxor %xmm2, %xmm0 - pxor %xmm6, %xmm4 - paddd -16*16(%rax), %xmm0 - paddd -15*16(%rax), %xmm4 - paddd sha256d_4preext2_17(%rip), %xmm4 - movdqa %xmm0, %xmm3 - movdqa %xmm4, %xmm7 - movdqa %xmm3, 0*16(%rax) - movdqa %xmm7, 1*16(%rax) +#endif /* USE_AVX */ - sha256_sse2_extend_doubleround 2 - sha256_sse2_extend_doubleround 4 - movdqa -9*16(%rax), %xmm0 - movdqa sha256d_4preext2_23(%rip), %xmm4 - movdqa %xmm0, %xmm2 - psrld $3, %xmm0 - movdqa %xmm0, %xmm1 - pslld $14, %xmm2 - psrld $4, %xmm1 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - psrld $11, %xmm1 - pslld $11, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - paddd -10*16(%rax), %xmm0 - paddd -9*16(%rax), %xmm4 - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - paddd -1*16(%rax), %xmm0 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - paddd 0*16(%rax), %xmm4 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd %xmm0, %xmm3 - paddd %xmm4, %xmm7 - movdqa %xmm3, 6*16(%rax) - movdqa %xmm7, 7*16(%rax) +#if defined(USE_XOP) + + .p2align 6 +sha256d_ms_4way_xop: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + subq $80, %rsp + movdqa %xmm6, 0(%rsp) + movdqa %xmm7, 16(%rsp) + movdqa %xmm8, 32(%rsp) + movdqa %xmm9, 48(%rsp) + movdqa %xmm10, 64(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + subq $1032, %rsp - movdqa sha256d_4preext2_24(%rip), %xmm0 - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - paddd 1*16(%rax), %xmm0 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd %xmm0, %xmm3 - paddd 2*16(%rax), %xmm7 - movdqa %xmm3, 8*16(%rax) - movdqa %xmm7, 9*16(%rax) + leaq 256(%rsi), %rax - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd 3*16(%rax), %xmm3 - paddd 4*16(%rax), %xmm7 - movdqa %xmm3, 10*16(%rax) - movdqa %xmm7, 11*16(%rax) +sha256d_ms_4way_xop_extend_loop1: + vmovdqa 3*16(%rsi), %xmm0 + vmovdqa 2*16(%rax), %xmm3 + vmovdqa 3*16(%rax), %xmm7 + vmovdqa %xmm3, 2*16(%rsp) + vmovdqa %xmm7, 3*16(%rsp) + vpaddd %xmm0, %xmm7, %xmm7 + vprotd $25, %xmm0, %xmm1 + vprotd $14, %xmm0, %xmm2 + vpsrld $3, %xmm0, %xmm0 + 
vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm3, %xmm3 + vmovdqa %xmm3, 2*16(%rax) + vmovdqa %xmm7, 3*16(%rax) - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd 5*16(%rax), %xmm3 - paddd 6*16(%rax), %xmm7 - movdqa %xmm3, 12*16(%rax) - movdqa %xmm7, 13*16(%rax) + vmovdqa 4*16(%rax), %xmm0 + vmovdqa %xmm0, 4*16(%rsp) + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vmovdqa %xmm3, 4*16(%rax) + vmovdqa %xmm7, 5*16(%rax) - movdqa sha256d_4preext2_30(%rip), %xmm0 - movdqa 0*16(%rax), %xmm4 - movdqa %xmm4, %xmm6 - psrld $3, %xmm4 - movdqa %xmm4, %xmm5 - pslld $14, %xmm6 - psrld $4, %xmm5 - pxor %xmm5, %xmm4 - pxor %xmm6, %xmm4 - psrld $11, %xmm5 - pslld $11, %xmm6 - pxor %xmm5, %xmm4 - pxor %xmm6, %xmm4 - paddd -1*16(%rax), %xmm4 - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - paddd 7*16(%rax), %xmm0 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - paddd 8*16(%rax), %xmm4 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd %xmm0, %xmm3 - paddd %xmm4, %xmm7 - movdqa %xmm3, 14*16(%rax) - movdqa %xmm7, 15*16(%rax) + vmovdqa 6*16(%rax), %xmm0 + vmovdqa 7*16(%rax), %xmm4 + vmovdqa %xmm0, 6*16(%rsp) + vmovdqa %xmm4, 7*16(%rsp) + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 6*16(%rax) + vmovdqa %xmm7, 7*16(%rax) - jmp sha256d_ms_4way_sse2_extend_loop2 + vmovdqa 8*16(%rax), %xmm0 + vmovdqa 2*16(%rax), %xmm4 + vmovdqa %xmm0, 8*16(%rsp) + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 8*16(%rax) + vmovdqa %xmm7, 9*16(%rax) -sha256d_ms_4way_sse2_extend_coda2: - sha256_sse2_extend_round 44 + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 3*16(%rax), %xmm3, %xmm3 + vpaddd 4*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 10*16(%rax) + vmovdqa %xmm7, 11*16(%rax) + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + 
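The vprotd $25/$14 plus vpsrld $3 and vprotd $15/$13 plus vpsrld $10 patterns in this XOP extend loop are the small sigma functions of the SHA-256 message schedule; XOP's vprotd supplies the rotates directly, where the SSE2/AVX paths build them from shift/XOR pairs. A scalar C sketch of one schedule step follows; identifiers are illustrative only.

#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }

/* One step of the SHA-256 message-schedule extension, W[16..63].
 * vprotd $25 / $14 / vpsrld $3   == rotr 7  ^ rotr 18 ^ shr 3   (sigma0)
 * vprotd $15 / $13 / vpsrld $10  == rotr 17 ^ rotr 19 ^ shr 10  (sigma1) */
static inline uint32_t sha256_extend_word(const uint32_t w[], int i)
{
    uint32_t s0 = rotr32(w[i - 15], 7)  ^ rotr32(w[i - 15], 18) ^ (w[i - 15] >> 3);
    uint32_t s1 = rotr32(w[i - 2], 17)  ^ rotr32(w[i - 2], 19)  ^ (w[i - 2] >> 10);
    return w[i - 16] + s0 + w[i - 7] + s1;
}

The unrolled "doubleround" shape of the assembly computes two consecutive W entries per block of instructions, which is why most operations appear in pairs of registers (xmm3/xmm7, xmm0/xmm4).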
vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 5*16(%rax), %xmm3, %xmm3 + vpaddd 6*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 12*16(%rax) + vmovdqa %xmm7, 13*16(%rax) + + vmovdqa 14*16(%rax), %xmm0 + vmovdqa 15*16(%rax), %xmm4 + vmovdqa %xmm0, 14*16(%rsp) + vmovdqa %xmm4, 15*16(%rsp) + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpaddd 7*16(%rax), %xmm0, %xmm0 + vpaddd 8*16(%rax), %xmm4, %xmm4 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 14*16(%rax) + vmovdqa %xmm7, 15*16(%rax) + +sha256d_ms_4way_xop_extend_loop2: + vmovdqa (16-15)*16(%rax), %xmm0 + vmovdqa (16-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (16-16)*16(%rax), %xmm0, %xmm0 + vpaddd (16-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (16-7)*16(%rax), %xmm0, %xmm0 + vpaddd (16-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 16*16(%rax) + vmovdqa %xmm7, (16+1)*16(%rax) + vmovdqa (18-15)*16(%rax), %xmm0 + vmovdqa (18-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (18-16)*16(%rax), %xmm0, %xmm0 + vpaddd (18-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (18-7)*16(%rax), %xmm0, %xmm0 + vpaddd (18-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 18*16(%rax) + vmovdqa %xmm7, (18+1)*16(%rax) + vmovdqa (20-15)*16(%rax), %xmm0 + vmovdqa (20-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (20-16)*16(%rax), %xmm0, %xmm0 + vpaddd (20-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (20-7)*16(%rax), %xmm0, %xmm0 + vpaddd (20-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd 
%xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 20*16(%rax) + vmovdqa %xmm7, (20+1)*16(%rax) + vmovdqa (22-15)*16(%rax), %xmm0 + vmovdqa (22-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (22-16)*16(%rax), %xmm0, %xmm0 + vpaddd (22-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (22-7)*16(%rax), %xmm0, %xmm0 + vpaddd (22-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 22*16(%rax) + vmovdqa %xmm7, (22+1)*16(%rax) + vmovdqa (24-15)*16(%rax), %xmm0 + vmovdqa (24-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (24-16)*16(%rax), %xmm0, %xmm0 + vpaddd (24-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (24-7)*16(%rax), %xmm0, %xmm0 + vpaddd (24-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 24*16(%rax) + vmovdqa %xmm7, (24+1)*16(%rax) + vmovdqa (26-15)*16(%rax), %xmm0 + vmovdqa (26-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (26-16)*16(%rax), %xmm0, %xmm0 + vpaddd (26-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (26-7)*16(%rax), %xmm0, %xmm0 + vpaddd (26-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 26*16(%rax) + vmovdqa %xmm7, (26+1)*16(%rax) + vmovdqa (28-15)*16(%rax), %xmm0 + vmovdqa (28-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (28-16)*16(%rax), %xmm0, %xmm0 + vpaddd (28-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (28-7)*16(%rax), %xmm0, %xmm0 + vpaddd (28-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, 
%xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 28*16(%rax) + vmovdqa %xmm7, (28+1)*16(%rax) + vmovdqa (30-15)*16(%rax), %xmm0 + vmovdqa (30-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (30-16)*16(%rax), %xmm0, %xmm0 + vpaddd (30-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (30-7)*16(%rax), %xmm0, %xmm0 + vpaddd (30-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 30*16(%rax) + vmovdqa %xmm7, (30+1)*16(%rax) + vmovdqa (32-15)*16(%rax), %xmm0 + vmovdqa (32-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (32-16)*16(%rax), %xmm0, %xmm0 + vpaddd (32-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (32-7)*16(%rax), %xmm0, %xmm0 + vpaddd (32-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 32*16(%rax) + vmovdqa %xmm7, (32+1)*16(%rax) + vmovdqa (34-15)*16(%rax), %xmm0 + vmovdqa (34-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (34-16)*16(%rax), %xmm0, %xmm0 + vpaddd (34-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (34-7)*16(%rax), %xmm0, %xmm0 + vpaddd (34-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 34*16(%rax) + vmovdqa %xmm7, (34+1)*16(%rax) + vmovdqa (36-15)*16(%rax), %xmm0 + vmovdqa (36-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (36-16)*16(%rax), %xmm0, %xmm0 + vpaddd (36-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (36-7)*16(%rax), %xmm0, %xmm0 + vpaddd (36-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, 
%xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 36*16(%rax) + vmovdqa %xmm7, (36+1)*16(%rax) + vmovdqa (38-15)*16(%rax), %xmm0 + vmovdqa (38-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (38-16)*16(%rax), %xmm0, %xmm0 + vpaddd (38-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (38-7)*16(%rax), %xmm0, %xmm0 + vpaddd (38-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 38*16(%rax) + vmovdqa %xmm7, (38+1)*16(%rax) + vmovdqa (40-15)*16(%rax), %xmm0 + vmovdqa (40-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (40-16)*16(%rax), %xmm0, %xmm0 + vpaddd (40-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (40-7)*16(%rax), %xmm0, %xmm0 + vpaddd (40-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 40*16(%rax) + vmovdqa %xmm7, (40+1)*16(%rax) + vmovdqa (42-15)*16(%rax), %xmm0 + vmovdqa (42-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (42-16)*16(%rax), %xmm0, %xmm0 + vpaddd (42-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (42-7)*16(%rax), %xmm0, %xmm0 + vpaddd (42-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 42*16(%rax) + vmovdqa %xmm7, (42+1)*16(%rax) + jz sha256d_ms_4way_xop_extend_coda2 + vmovdqa (44-15)*16(%rax), %xmm0 + vmovdqa (44-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (44-16)*16(%rax), %xmm0, %xmm0 + vpaddd (44-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (44-7)*16(%rax), 
%xmm0, %xmm0 + vpaddd (44-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 44*16(%rax) + vmovdqa %xmm7, (44+1)*16(%rax) + vmovdqa (46-15)*16(%rax), %xmm0 + vmovdqa (46-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (46-16)*16(%rax), %xmm0, %xmm0 + vpaddd (46-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (46-7)*16(%rax), %xmm0, %xmm0 + vpaddd (46-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 46*16(%rax) + vmovdqa %xmm7, (46+1)*16(%rax) - movdqa sha256_4h+0(%rip), %xmm7 - movdqa sha256_4h+16(%rip), %xmm5 - movdqa sha256_4h+32(%rip), %xmm4 - movdqa sha256_4h+48(%rip), %xmm3 - movdqa sha256_4h+64(%rip), %xmm0 - movdqa sha256_4h+80(%rip), %xmm1 - movdqa sha256_4h+96(%rip), %xmm2 - movdqa sha256_4h+112(%rip), %xmm6 - movdqa %xmm1, 0(%rsp) - movdqa %xmm2, 16(%rsp) - movdqa %xmm6, 32(%rsp) + movdqa 0(%rcx), %xmm7 + movdqa 16(%rcx), %xmm8 + movdqa 32(%rcx), %xmm9 + movdqa 48(%rcx), %xmm10 + movdqa 64(%rcx), %xmm0 + movdqa 80(%rcx), %xmm5 + movdqa 96(%rcx), %xmm4 + movdqa 112(%rcx), %xmm3 - leaq 48(%rsp), %rax + movq %rsi, %rax leaq sha256_4k(%rip), %rcx - jmp sha256d_ms_4way_sse2_main_loop2 + jmp sha256d_ms_4way_xop_main_loop1 + +sha256d_ms_4way_xop_main_loop2: + vpaddd 16*(0)(%rax), %xmm10, %xmm6 + vpaddd 16*(0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(1)(%rax), %xmm9, %xmm6 + vpaddd 16*(1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(2)(%rax), %xmm8, %xmm6 + vpaddd 16*(2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, 
%xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 +sha256d_ms_4way_xop_main_loop1: + vpaddd 16*(3)(%rax), %xmm0, %xmm6 + vpaddd 16*(3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(4+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(4+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(4+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(4+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 -.macro sha256_sse2_main_round_red i, r7 - movdqa 16*\i(%rax), %xmm6 - paddd 16*\i(%rcx), %xmm6 - paddd 32(%rsp), %xmm6 - movdqa %xmm0, %xmm1 - movdqa 16(%rsp), %xmm2 - paddd \r7, %xmm6 - pandn %xmm2, %xmm1 - movdqa %xmm2, 32(%rsp) - movdqa 0(%rsp), %xmm2 - movdqa %xmm2, 16(%rsp) - pand %xmm0, %xmm2 - pxor %xmm2, %xmm1 - movdqa %xmm0, 0(%rsp) - paddd %xmm1, %xmm6 - movdqa %xmm0, %xmm1 - psrld $6, %xmm0 - movdqa %xmm0, %xmm2 - pslld $7, %xmm1 - psrld $5, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - pslld $14, %xmm1 - psrld $14, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - pslld $5, %xmm1 - pxor %xmm1, %xmm0 - paddd %xmm6, %xmm0 -.endm + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 -sha256d_ms_4way_sse2_finish: - sha256_sse2_main_round_red 57, %xmm3 - sha256_sse2_main_round_red 58, %xmm4 - sha256_sse2_main_round_red 59, %xmm5 - sha256_sse2_main_round_red 60, %xmm7 - - paddd sha256_4h+112(%rip), %xmm0 - movdqa %xmm0, 112(%rdi) - - addq $8+67*16, %rsp -#if defined(_WIN64) || defined(__CYGWIN__) - popq %rsi - movdqa 0(%rsp), %xmm6 - movdqa 16(%rsp), %xmm7 - addq $32, %rsp - popq %rdi -#endif - ret - - -#if defined(USE_AVX) - - .p2align 6 
-sha256d_ms_4way_avx: -#if defined(_WIN64) || defined(__CYGWIN__) - pushq %rdi - subq $80, %rsp - movdqa %xmm6, 0(%rsp) - movdqa %xmm7, 16(%rsp) - movdqa %xmm8, 32(%rsp) - movdqa %xmm9, 48(%rsp) - movdqa %xmm10, 64(%rsp) - pushq %rsi - movq %rcx, %rdi - movq %rdx, %rsi - movq %r8, %rdx - movq %r9, %rcx -#endif - subq $1032, %rsp - - leaq 256(%rsi), %rax - -sha256d_ms_4way_avx_extend_loop1: - vmovdqa 3*16(%rsi), %xmm0 - vmovdqa 2*16(%rax), %xmm3 - vmovdqa 3*16(%rax), %xmm7 - vmovdqa %xmm3, 2*16(%rsp) - vmovdqa %xmm7, 3*16(%rsp) - vpaddd %xmm0, %xmm7, %xmm7 - vpslld $14, %xmm0, %xmm2 - vpsrld $3, %xmm0, %xmm0 - vpsrld $4, %xmm0, %xmm1 - vpxor %xmm1, %xmm0, %xmm0 + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(4+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(4+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(4+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(4+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 vpxor %xmm2, %xmm0, %xmm0 - vpsrld $11, %xmm1, %xmm1 - vpslld $11, %xmm2, %xmm2 - vpxor %xmm1, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(8+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(8+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(8+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(8+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 
+ vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(8+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(8+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(8+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(8+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 vpxor %xmm2, %xmm0, %xmm0 - vpaddd %xmm0, %xmm3, %xmm3 - vmovdqa %xmm3, 2*16(%rax) - vmovdqa %xmm7, 3*16(%rax) - - vmovdqa 4*16(%rax), %xmm0 - vmovdqa %xmm0, 4*16(%rsp) - vpslld $13, %xmm3, %xmm2 - vpslld $13, %xmm7, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpsrld $7, %xmm3, %xmm1 - vpsrld $7, %xmm7, %xmm5 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpsrld $2, %xmm1, %xmm1 - vpsrld $2, %xmm5, %xmm5 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(12+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(12+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpslld $2, %xmm2, %xmm2 - vpslld $2, %xmm6, %xmm6 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(12+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(12+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + 
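Throughout these 4-way routines the hashes are interleaved word by word: each 16-byte slot (sha256_4h+16*i, 16*r(%rax), 16*r(%rcx)) holds the same state, schedule, or constant word for four independent SHA-256 computations, so a single vpaddd or vpxor advances all four lanes at once. A small C sketch of that structure-of-arrays layout, with purely hypothetical type and function names:

#include <stdint.h>

/* Hypothetical helper type mirroring the 4-way layout: one 128-bit
 * slot = the same 32-bit word of four independent hash states. */
typedef struct { uint32_t lane[4]; } w4;

/* Add a (4-way replicated) round constant to all four lanes; in the
 * assembly this is a single vpaddd against the sha256_4k table. */
static inline w4 add4(w4 a, uint32_t k)
{
    w4 r;
    for (int i = 0; i < 4; i++)
        r.lane[i] = a.lane[i] + k;
    return r;
}

This layout is a design choice: it keeps every packed instruction operating on four unrelated nonce candidates, so no cross-lane shuffles are needed anywhere in the round or schedule code.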
vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(12+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(12+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(12+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(12+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(16+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(16+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(16+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(16+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(16+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(16+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor 
%xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(16+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(16+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(20+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(20+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd %xmm0, %xmm3, %xmm3 - vmovdqa %xmm3, 4*16(%rax) - vmovdqa %xmm7, 5*16(%rax) - - vmovdqa 6*16(%rax), %xmm0 - vmovdqa 7*16(%rax), %xmm4 - vmovdqa %xmm0, 6*16(%rsp) - vmovdqa %xmm4, 7*16(%rsp) - vpslld $13, %xmm3, %xmm2 - vpslld $13, %xmm7, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpsrld $7, %xmm3, %xmm1 - vpsrld $7, %xmm7, %xmm5 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpsrld $2, %xmm1, %xmm1 - vpsrld $2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(20+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(20+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(20+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(20+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 
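+	/*
+	 * Descriptive note added for readability (not part of the generated
+	 * round code): each unrolled round in this XOP main loop repeats the
+	 * same pattern -- the two vpaddd from (%rax)/(%rcx) fetch W[i] and the
+	 * round constant K[i]; Ch(e,f,g) is vpandn/vpand/vpxor; Sigma1(e) is
+	 * vprotd $26/$21/$7 (left rotates, i.e. right rotates by 6/11/25);
+	 * Maj(a,b,c) is three vpand plus two vpxor; and Sigma0(a) is
+	 * vprotd $30/$19/$10 (right rotates by 2/13/22). XOP's vprotd lets
+	 * each rotate be one instruction instead of the shift/xor sequences
+	 * used in the plain AVX path being replaced by this hunk.
+	 */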
+ + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(20+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(20+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(24+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(24+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpslld $2, %xmm2, %xmm2 - vpslld $2, %xmm6, %xmm6 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(24+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(24+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(24+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(24+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(24+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(24+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, 
%xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(28+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(28+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd %xmm0, %xmm3, %xmm3 - vpaddd %xmm4, %xmm7, %xmm7 - vmovdqa %xmm3, 6*16(%rax) - vmovdqa %xmm7, 7*16(%rax) - - vmovdqa 8*16(%rax), %xmm0 - vmovdqa 2*16(%rax), %xmm4 - vmovdqa %xmm0, 8*16(%rsp) - vpslld $13, %xmm3, %xmm2 - vpslld $13, %xmm7, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpsrld $7, %xmm3, %xmm1 - vpsrld $7, %xmm7, %xmm5 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpsrld $2, %xmm1, %xmm1 - vpsrld $2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(28+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(28+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(28+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(28+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(28+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(28+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + 
vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(32+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(32+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpslld $2, %xmm2, %xmm2 - vpslld $2, %xmm6, %xmm6 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(32+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(32+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(32+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(32+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(32+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(32+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(36+0)(%rax), %xmm10, %xmm6 + 
vpaddd 16*(36+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd %xmm0, %xmm3, %xmm3 - vpaddd %xmm4, %xmm7, %xmm7 - vmovdqa %xmm3, 8*16(%rax) - vmovdqa %xmm7, 9*16(%rax) - - vpslld $13, %xmm3, %xmm2 - vpslld $13, %xmm7, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpsrld $7, %xmm3, %xmm1 - vpsrld $7, %xmm7, %xmm5 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpsrld $2, %xmm1, %xmm1 - vpsrld $2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(36+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(36+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(36+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(36+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(36+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(36+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(40+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(40+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + 
vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpslld $2, %xmm2, %xmm2 - vpslld $2, %xmm6, %xmm6 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(40+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(40+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(40+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(40+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(40+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(40+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(44+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(44+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd 
3*16(%rax), %xmm3, %xmm3 - vpaddd 4*16(%rax), %xmm7, %xmm7 - vmovdqa %xmm3, 10*16(%rax) - vmovdqa %xmm7, 11*16(%rax) - - vpslld $13, %xmm3, %xmm2 - vpslld $13, %xmm7, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpsrld $7, %xmm3, %xmm1 - vpsrld $7, %xmm7, %xmm5 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpsrld $2, %xmm1, %xmm1 - vpsrld $2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(44+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(44+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(44+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(44+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(44+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(44+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(48+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(48+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpslld $2, %xmm2, %xmm2 - vpslld $2, %xmm6, %xmm6 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(48+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(48+1)(%rcx), %xmm6, 
%xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(48+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(48+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(48+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(48+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(52+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(52+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd 5*16(%rax), %xmm3, %xmm3 - vpaddd 6*16(%rax), %xmm7, %xmm7 - vmovdqa %xmm3, 12*16(%rax) - vmovdqa %xmm7, 13*16(%rax) - - vmovdqa 14*16(%rax), %xmm0 - vmovdqa 15*16(%rax), %xmm4 - vmovdqa %xmm0, 14*16(%rsp) - vmovdqa %xmm4, 15*16(%rsp) - vpslld $13, %xmm3, %xmm2 - vpslld $13, %xmm7, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpaddd 7*16(%rax), %xmm0, %xmm0 - vpaddd 8*16(%rax), %xmm4, %xmm4 - vpsrld $7, %xmm3, %xmm1 - vpsrld $7, %xmm7, %xmm5 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpsrld $2, %xmm1, %xmm1 - vpsrld $2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(52+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(52+1)(%rcx), %xmm6, %xmm6 + + vpandn 
%xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(52+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(52+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(52+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(52+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(56)(%rax), %xmm10, %xmm6 + vpaddd 16*(56)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpslld $2, %xmm2, %xmm2 - vpslld $2, %xmm6, %xmm6 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 + vpaddd %xmm6, %xmm3, %xmm3 + jz sha256d_ms_4way_xop_finish + vpaddd 16*(57)(%rax), %xmm9, %xmm6 + vpaddd 16*(57)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, 
%xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(58)(%rax), %xmm8, %xmm6 + vpaddd 16*(58)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(59)(%rax), %xmm0, %xmm6 + vpaddd 16*(59)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 + vpaddd 16*(60+0)(%rax), %xmm10, %xmm6 + vpaddd 16*(60+0)(%rcx), %xmm6, %xmm6 + + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + vpand %xmm5, %xmm4, %xmm2 + vpand %xmm7, %xmm4, %xmm3 + vpand %xmm7, %xmm5, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm7, %xmm1 + vprotd $19, %xmm7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm7, %xmm3 vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd %xmm0, %xmm3, %xmm3 - vpaddd %xmm4, %xmm7, %xmm7 - vmovdqa %xmm3, 14*16(%rax) - vmovdqa %xmm7, 15*16(%rax) - -sha256d_ms_4way_avx_extend_loop2: - sha256_avx_extend_doubleround 16 - sha256_avx_extend_doubleround 18 - sha256_avx_extend_doubleround 20 - sha256_avx_extend_doubleround 22 - sha256_avx_extend_doubleround 24 - sha256_avx_extend_doubleround 26 - sha256_avx_extend_doubleround 28 - sha256_avx_extend_doubleround 30 - sha256_avx_extend_doubleround 32 - sha256_avx_extend_doubleround 34 - sha256_avx_extend_doubleround 36 - sha256_avx_extend_doubleround 38 - sha256_avx_extend_doubleround 40 - sha256_avx_extend_doubleround 42 - jz sha256d_ms_4way_avx_extend_coda2 - sha256_avx_extend_doubleround 44 - sha256_avx_extend_doubleround 46 - - movdqa 0(%rcx), %xmm7 - movdqa 16(%rcx), %xmm8 - movdqa 32(%rcx), %xmm9 - movdqa 48(%rcx), %xmm10 - movdqa 64(%rcx), %xmm0 - movdqa 80(%rcx), %xmm5 - movdqa 96(%rcx), %xmm4 - movdqa 112(%rcx), %xmm3 - - movq %rsi, %rax - leaq sha256_4k(%rip), %rcx - jmp sha256d_ms_4way_avx_main_loop1 - -sha256d_ms_4way_avx_main_loop2: - sha256_avx_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 - sha256_avx_main_round 1, 
%xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 - sha256_avx_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 -sha256d_ms_4way_avx_main_loop1: - sha256_avx_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 - sha256_avx_main_quadround 4 - sha256_avx_main_quadround 8 - sha256_avx_main_quadround 12 - sha256_avx_main_quadround 16 - sha256_avx_main_quadround 20 - sha256_avx_main_quadround 24 - sha256_avx_main_quadround 28 - sha256_avx_main_quadround 32 - sha256_avx_main_quadround 36 - sha256_avx_main_quadround 40 - sha256_avx_main_quadround 44 - sha256_avx_main_quadround 48 - sha256_avx_main_quadround 52 - sha256_avx_main_round 56, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 - jz sha256d_ms_4way_avx_finish - sha256_avx_main_round 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 - sha256_avx_main_round 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 - sha256_avx_main_round 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 - sha256_avx_main_quadround 60 + vpaddd %xmm6, %xmm3, %xmm3 + vpaddd 16*(60+1)(%rax), %xmm9, %xmm6 + vpaddd 16*(60+1)(%rcx), %xmm6, %xmm6 + + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + + vpand %xmm7, %xmm5, %xmm2 + vpand %xmm3, %xmm5, %xmm4 + vpand %xmm3, %xmm7, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm3, %xmm1 + vprotd $19, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm3, %xmm4 + vpxor %xmm2, %xmm4, %xmm4 + vpaddd %xmm6, %xmm4, %xmm4 + vpaddd 16*(60+2)(%rax), %xmm8, %xmm6 + vpaddd 16*(60+2)(%rcx), %xmm6, %xmm6 + + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + + vpand %xmm3, %xmm7, %xmm2 + vpand %xmm4, %xmm7, %xmm5 + vpand %xmm4, %xmm3, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm4, %xmm1 + vprotd $19, %xmm4, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm4, %xmm5 + vpxor %xmm2, %xmm5, %xmm5 + vpaddd %xmm6, %xmm5, %xmm5 + vpaddd 16*(60+3)(%rax), %xmm0, %xmm6 + vpaddd 16*(60+3)(%rcx), %xmm6, %xmm6 + + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + + vpand %xmm4, %xmm3, %xmm2 + vpand %xmm5, %xmm3, %xmm7 + vpand %xmm5, %xmm4, %xmm1 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, %xmm5, %xmm1 + vprotd $19, %xmm5, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, %xmm5, %xmm7 + vpxor %xmm2, %xmm7, %xmm7 + vpaddd %xmm6, %xmm7, %xmm7 movdqa 2*16(%rsp), %xmm1 movdqa 3*16(%rsp), %xmm2 @@ -2892,22 +25202,14 @@ sha256d_ms_4way_avx_main_loop1: vmovdqa -15*16(%rax), %xmm0 vmovdqa -14*16(%rax), %xmm4 - vpslld $14, %xmm0, %xmm2 - vpslld $14, %xmm4, %xmm6 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + 
vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 vpsrld $3, %xmm0, %xmm8 vpsrld $3, %xmm4, %xmm4 - vpsrld $7, %xmm0, %xmm1 - vpsrld $4, %xmm4, %xmm5 - vpxor %xmm1, %xmm8, %xmm8 - vpxor %xmm5, %xmm4, %xmm4 - vpsrld $11, %xmm1, %xmm1 - vpsrld $11, %xmm5, %xmm5 - vpxor %xmm2, %xmm8, %xmm8 - vpxor %xmm6, %xmm4, %xmm4 - vpslld $11, %xmm2, %xmm2 - vpslld $11, %xmm6, %xmm6 - vpxor %xmm1, %xmm8, %xmm8 - vpxor %xmm5, %xmm4, %xmm4 vpxor %xmm2, %xmm8, %xmm8 vpxor %xmm6, %xmm4, %xmm4 vpaddd %xmm0, %xmm4, %xmm4 @@ -2916,39 +25218,95 @@ sha256d_ms_4way_avx_main_loop1: vmovdqa %xmm3, 0*16(%rax) vmovdqa %xmm7, 1*16(%rax) - sha256_avx_extend_doubleround 2 - sha256_avx_extend_doubleround 4 + vmovdqa (2-15)*16(%rax), %xmm0 + vmovdqa (2-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (2-16)*16(%rax), %xmm0, %xmm0 + vpaddd (2-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (2-7)*16(%rax), %xmm0, %xmm0 + vpaddd (2-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 2*16(%rax) + vmovdqa %xmm7, (2+1)*16(%rax) + vmovdqa (4-15)*16(%rax), %xmm0 + vmovdqa (4-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (4-16)*16(%rax), %xmm0, %xmm0 + vpaddd (4-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (4-7)*16(%rax), %xmm0, %xmm0 + vpaddd (4-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 4*16(%rax) + vmovdqa %xmm7, (4+1)*16(%rax) vmovdqa -9*16(%rax), %xmm0 - vpslld $14, %xmm0, %xmm2 + vprotd $25, %xmm0, %xmm1 + vprotd $14, %xmm0, %xmm2 vpsrld $3, %xmm0, %xmm8 - vpsrld $7, %xmm0, %xmm1 - vpxor %xmm1, %xmm8, %xmm8 - vpxor %xmm2, %xmm8, %xmm8 - vpsrld $11, %xmm1, %xmm1 - vpslld $11, %xmm2, %xmm2 - vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm1, %xmm2, %xmm2 vpxor %xmm2, %xmm8, %xmm8 vpaddd sha256d_4preext2_23(%rip), %xmm0, %xmm4 vpaddd -10*16(%rax), %xmm8, %xmm0 - vpslld $13, %xmm3, %xmm2 - vpslld $13, %xmm7, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 vpaddd -1*16(%rax), %xmm0, %xmm0 vpaddd 0*16(%rax), %xmm4, %xmm4 - vpsrld $7, %xmm3, %xmm1 - vpsrld $7, %xmm7, %xmm5 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpsrld $2, %xmm1, %xmm1 - vpsrld $2, %xmm5, %xmm5 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpslld $2, %xmm2, %xmm2 - vpslld $2, 
%xmm6, %xmm6 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 vpxor %xmm2, %xmm3, %xmm3 vpxor %xmm6, %xmm7, %xmm7 vpaddd %xmm0, %xmm3, %xmm3 @@ -2956,22 +25314,14 @@ sha256d_ms_4way_avx_main_loop1: vmovdqa %xmm3, 6*16(%rax) vmovdqa %xmm7, 7*16(%rax) - vpslld $13, %xmm3, %xmm2 - vpslld $13, %xmm7, %xmm6 + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 vpsrld $10, %xmm3, %xmm3 vpsrld $10, %xmm7, %xmm7 - vpsrld $7, %xmm3, %xmm1 - vpsrld $7, %xmm7, %xmm5 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpsrld $2, %xmm1, %xmm1 - vpsrld $2, %xmm5, %xmm5 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpslld $2, %xmm2, %xmm2 - vpslld $2, %xmm6, %xmm6 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 vpxor %xmm2, %xmm3, %xmm3 vpxor %xmm6, %xmm7, %xmm7 vpaddd sha256d_4preext2_24(%rip), %xmm3, %xmm3 @@ -2980,45 +25330,29 @@ sha256d_ms_4way_avx_main_loop1: vmovdqa %xmm3, 8*16(%rax) vmovdqa %xmm7, 9*16(%rax) - vpslld $13, %xmm3, %xmm2 - vpslld $13, %xmm7, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpsrld $7, %xmm3, %xmm1 - vpsrld $7, %xmm7, %xmm5 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpsrld $2, %xmm1, %xmm1 - vpsrld $2, %xmm5, %xmm5 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpslld $2, %xmm2, %xmm2 - vpslld $2, %xmm6, %xmm6 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd 3*16(%rax), %xmm3, %xmm3 - vpaddd 4*16(%rax), %xmm7, %xmm7 - vmovdqa %xmm3, 10*16(%rax) - vmovdqa %xmm7, 11*16(%rax) - - vpslld $13, %xmm3, %xmm2 - vpslld $13, %xmm7, %xmm6 + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpsrld $7, %xmm3, %xmm1 - vpsrld $7, %xmm7, %xmm5 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpsrld $2, %xmm1, %xmm1 - vpsrld $2, %xmm5, %xmm5 + vpsrld $10, %xmm7, %xmm7 vpxor %xmm2, %xmm3, %xmm3 vpxor %xmm6, %xmm7, %xmm7 - vpslld $2, %xmm2, %xmm2 - vpslld $2, %xmm6, %xmm6 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 + vpaddd 3*16(%rax), %xmm3, %xmm3 + vpaddd 4*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 10*16(%rax) + vmovdqa %xmm7, 11*16(%rax) + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 vpxor %xmm2, %xmm3, %xmm3 vpxor %xmm6, %xmm7, %xmm7 vpaddd 5*16(%rax), %xmm3, %xmm3 @@ -3026,867 +25360,3872 @@ sha256d_ms_4way_avx_main_loop1: vmovdqa %xmm3, 12*16(%rax) vmovdqa %xmm7, 13*16(%rax) - vmovdqa sha256d_4preext2_30(%rip), %xmm0 - vmovdqa 0*16(%rax), %xmm4 - vpslld $14, %xmm4, %xmm6 - vpsrld $3, %xmm4, %xmm4 - vpsrld $4, %xmm4, %xmm5 - vpxor %xmm5, %xmm4, %xmm4 - vpxor %xmm6, %xmm4, %xmm4 - vpsrld $11, %xmm5, %xmm5 - vpslld $11, %xmm6, %xmm6 - vpxor %xmm5, %xmm4, %xmm4 - vpxor %xmm6, %xmm4, %xmm4 - vpaddd -1*16(%rax), %xmm4, %xmm4 - vpslld $13, %xmm3, %xmm2 - vpslld $13, %xmm7, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpaddd 7*16(%rax), %xmm0, %xmm0 - vpaddd 8*16(%rax), %xmm4, %xmm4 - vpsrld $7, %xmm3, %xmm1 - vpsrld $7, %xmm7, %xmm5 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpsrld $2, %xmm1, %xmm1 - 
vpsrld $2, %xmm5, %xmm5 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpslld $2, %xmm2, %xmm2 - vpslld $2, %xmm6, %xmm6 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd %xmm0, %xmm3, %xmm3 - vpaddd %xmm4, %xmm7, %xmm7 - vmovdqa %xmm3, 14*16(%rax) - vmovdqa %xmm7, 15*16(%rax) + vmovdqa sha256d_4preext2_30(%rip), %xmm0 + vmovdqa 0*16(%rax), %xmm4 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm6, %xmm4, %xmm4 + vpaddd -1*16(%rax), %xmm4, %xmm4 + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpaddd 7*16(%rax), %xmm0, %xmm0 + vpaddd 8*16(%rax), %xmm4, %xmm4 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 14*16(%rax) + vmovdqa %xmm7, 15*16(%rax) + + jmp sha256d_ms_4way_xop_extend_loop2 + +sha256d_ms_4way_xop_extend_coda2: + vmovdqa (44-15)*16(%rax), %xmm0 + vprotd $25, %xmm0, %xmm1 + vprotd $14, %xmm0, %xmm2 + vpsrld $3, %xmm0, %xmm0 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm2, %xmm0, %xmm0 + + vpaddd (44-16)*16(%rax), %xmm0, %xmm0 + vpaddd (44-7)*16(%rax), %xmm0, %xmm0 + + vprotd $15, %xmm3, %xmm1 + vprotd $13, %xmm3, %xmm2 + vpsrld $10, %xmm3, %xmm3 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm0, %xmm3, %xmm3 + vmovdqa %xmm3, 44*16(%rax) + + movdqa sha256_4h+0(%rip), %xmm7 + movdqa sha256_4h+16(%rip), %xmm5 + movdqa sha256_4h+32(%rip), %xmm4 + movdqa sha256_4h+48(%rip), %xmm3 + movdqa sha256_4h+64(%rip), %xmm0 + movdqa sha256_4h+80(%rip), %xmm8 + movdqa sha256_4h+96(%rip), %xmm9 + movdqa sha256_4h+112(%rip), %xmm10 + + movq %rsp, %rax + leaq sha256_4k(%rip), %rcx + jmp sha256d_ms_4way_xop_main_loop2 + + +sha256d_ms_4way_xop_finish: + vpaddd 16*57(%rax), %xmm9, %xmm6 + vpaddd 16*57(%rcx), %xmm6, %xmm6 + vpandn %xmm8, %xmm10, %xmm1 + vpand %xmm10, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + vprotd $26, %xmm10, %xmm1 + vprotd $21, %xmm10, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm10, %xmm9 + vpxor %xmm2, %xmm9, %xmm9 + vpaddd %xmm9, %xmm6, %xmm6 + vpaddd %xmm6, %xmm4, %xmm9 + vpaddd 16*58(%rax), %xmm8, %xmm6 + vpaddd 16*58(%rcx), %xmm6, %xmm6 + vpandn %xmm0, %xmm9, %xmm1 + vpand %xmm9, %xmm10, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + vprotd $26, %xmm9, %xmm1 + vprotd $21, %xmm9, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm9, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd %xmm8, %xmm6, %xmm6 + vpaddd %xmm6, %xmm5, %xmm8 + vpaddd 16*59(%rax), %xmm0, %xmm6 + vpaddd 16*59(%rcx), %xmm6, %xmm6 + vpandn %xmm10, %xmm8, %xmm1 + vpand %xmm8, %xmm9, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + vprotd $26, %xmm8, %xmm1 + vprotd $21, %xmm8, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm8, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm6, %xmm6 + vpaddd %xmm6, %xmm7, %xmm0 + vpaddd 16*60(%rax), %xmm10, %xmm6 + vpaddd 16*60(%rcx), %xmm6, %xmm6 + vpandn %xmm9, %xmm0, %xmm1 + vpand %xmm0, %xmm8, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + vprotd $26, %xmm0, %xmm1 + vprotd $21, %xmm0, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, %xmm0, %xmm10 + vpxor %xmm2, %xmm10, %xmm10 + vpaddd %xmm10, %xmm6, %xmm6 + vpaddd %xmm6, %xmm3, %xmm10 + + paddd sha256_4h+112(%rip), %xmm10 + 
movdqa %xmm10, 112(%rdi) + + addq $1032, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + movdqa 0(%rsp), %xmm6 + movdqa 16(%rsp), %xmm7 + movdqa 32(%rsp), %xmm8 + movdqa 48(%rsp), %xmm9 + movdqa 64(%rsp), %xmm10 + addq $80, %rsp + popq %rdi +#endif + ret + +#endif /* USE_XOP */ + + + .text + .p2align 6 + .globl sha256_use_4way + .globl _sha256_use_4way +sha256_use_4way: +_sha256_use_4way: + pushq %rbx + pushq %rcx + pushq %rdx + + /* Check for VIA PadLock Hash Engine */ + movl $0xc0000000, %eax + cpuid + cmpl $0xc0000001, %eax + jb sha256_use_4way_no_phe + movl $0xc0000001, %eax + cpuid + andl $0x00000c00, %edx + cmpl $0x00000c00, %edx + jne sha256_use_4way_no_phe + leaq sha256_transform_phe(%rip), %rdx + movq %rdx, sha256_transform_addr(%rip) + xorl %eax, %eax + jmp sha256_use_4way_exit +sha256_use_4way_no_phe: +#if defined(USE_AVX) + /* Check for AVX and OSXSAVE support */ + movl $1, %eax + cpuid + andl $0x18000000, %ecx + cmpl $0x18000000, %ecx + jne sha256_use_4way_base + /* Check for XMM and YMM state support */ + xorl %ecx, %ecx + xgetbv + andl $0x00000006, %eax + cmpl $0x00000006, %eax + jne sha256_use_4way_base +#if defined(USE_XOP) + /* Check for XOP support */ + movl $0x80000001, %eax + cpuid + andl $0x00000800, %ecx + jz sha256_use_4way_avx + +sha256_use_4way_xop: + leaq sha256d_ms_4way_xop(%rip), %rcx + leaq sha256_transform_4way_core_xop(%rip), %rdx + jmp sha256_use_4way_done +#endif /* USE_XOP */ + +sha256_use_4way_avx: + leaq sha256d_ms_4way_avx(%rip), %rcx + leaq sha256_transform_4way_core_avx(%rip), %rdx + jmp sha256_use_4way_done +#endif /* USE_AVX */ + +sha256_use_4way_base: + leaq sha256d_ms_4way_sse2(%rip), %rcx + leaq sha256_transform_4way_core_sse2(%rip), %rdx + +sha256_use_4way_done: + movq %rcx, sha256d_ms_4way_addr(%rip) + movq %rdx, sha256_transform_4way_core_addr(%rip) + movl $1, %eax +sha256_use_4way_exit: + popq %rdx + popq %rcx + popq %rbx + ret + + +#if defined(USE_AVX2) + + .text + .p2align 6 + .globl sha256d_ms_8way + .globl _sha256d_ms_8way +sha256d_ms_8way: +_sha256d_ms_8way: +sha256d_ms_8way_avx2: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + subq $80, %rsp + vmovdqa %xmm6, 0(%rsp) + vmovdqa %xmm7, 16(%rsp) + vmovdqa %xmm8, 32(%rsp) + vmovdqa %xmm9, 48(%rsp) + vmovdqa %xmm10, 64(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + pushq %rbp + movq %rsp, %rbp + subq $64*32, %rsp + andq $-128, %rsp + + leaq 16*32(%rsi), %rax + +sha256d_ms_8way_avx2_extend_loop1: + vmovdqa 3*32(%rsi), %ymm0 + vmovdqa 2*32(%rax), %ymm3 + vmovdqa 3*32(%rax), %ymm7 + vmovdqa %ymm3, 2*32(%rsp) + vmovdqa %ymm7, 3*32(%rsp) + vpaddd %ymm0, %ymm7, %ymm7 + vpslld $14, %ymm0, %ymm2 + vpsrld $3, %ymm0, %ymm0 + vpsrld $4, %ymm0, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpsrld $11, %ymm1, %ymm1 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpaddd %ymm0, %ymm3, %ymm3 + vmovdqa %ymm3, 2*32(%rax) + vmovdqa %ymm7, 3*32(%rax) + + vmovdqa 4*32(%rax), %ymm0 + vmovdqa %ymm0, 4*32(%rsp) + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, 
%ymm7 + vpaddd %ymm0, %ymm3, %ymm3 + vmovdqa %ymm3, 4*32(%rax) + vmovdqa %ymm7, 5*32(%rax) + + vmovdqa 6*32(%rax), %ymm0 + vmovdqa 7*32(%rax), %ymm4 + vmovdqa %ymm0, 6*32(%rsp) + vmovdqa %ymm4, 7*32(%rsp) + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 6*32(%rax) + vmovdqa %ymm7, 7*32(%rax) + + vmovdqa 8*32(%rax), %ymm0 + vmovdqa 2*32(%rax), %ymm4 + vmovdqa %ymm0, 8*32(%rsp) + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 8*32(%rax) + vmovdqa %ymm7, 9*32(%rax) - jmp sha256d_ms_4way_avx_extend_loop2 + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd 3*32(%rax), %ymm3, %ymm3 + vpaddd 4*32(%rax), %ymm7, %ymm7 + vmovdqa %ymm3, 10*32(%rax) + vmovdqa %ymm7, 11*32(%rax) -sha256d_ms_4way_avx_extend_coda2: - sha256_avx_extend_round 44 + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd 5*32(%rax), %ymm3, %ymm3 + vpaddd 6*32(%rax), %ymm7, %ymm7 + vmovdqa %ymm3, 12*32(%rax) + vmovdqa %ymm7, 13*32(%rax) - movdqa sha256_4h+0(%rip), %xmm7 - movdqa sha256_4h+16(%rip), %xmm5 - movdqa sha256_4h+32(%rip), %xmm4 - movdqa sha256_4h+48(%rip), %xmm3 - movdqa sha256_4h+64(%rip), %xmm0 - movdqa sha256_4h+80(%rip), %xmm8 - movdqa sha256_4h+96(%rip), %xmm9 - movdqa sha256_4h+112(%rip), %xmm10 + vmovdqa 14*32(%rax), %ymm0 + vmovdqa 15*32(%rax), %ymm4 + vmovdqa %ymm0, 14*32(%rsp) + vmovdqa %ymm4, 15*32(%rsp) + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpaddd 7*32(%rax), %ymm0, %ymm0 + vpaddd 8*32(%rax), %ymm4, %ymm4 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, 
%ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 14*32(%rax) + vmovdqa %ymm7, 15*32(%rax) - movq %rsp, %rax - leaq sha256_4k(%rip), %rcx - jmp sha256d_ms_4way_avx_main_loop2 +sha256d_ms_8way_avx2_extend_loop2: + vmovdqa (16-15)*32(%rax), %ymm0 + vmovdqa (16-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (16-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (16-7)*32(%rax), %ymm0, %ymm0 + vpaddd (16-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 16*32(%rax) + vmovdqa %ymm7, (16+1)*32(%rax) + vmovdqa (18-15)*32(%rax), %ymm0 + vmovdqa (18-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (18-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (18-7)*32(%rax), %ymm0, %ymm0 + vpaddd (18-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 18*32(%rax) + vmovdqa %ymm7, (18+1)*32(%rax) + vmovdqa (20-15)*32(%rax), %ymm0 + vmovdqa (20-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 
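+	/* sha256d_ms_8way_avx2_extend_loop2: each block above and below computes two
+	 * message-schedule words, W[i] = s1(W[i-2]) + W[i-7] + s0(W[i-15]) + W[i-16],
+	 * for eight lanes at once.  s0 = ROTR7^ROTR18^SHR3 and s1 = ROTR17^ROTR19^SHR10
+	 * are assembled from vpslld/vpsrld/vpxor, as AVX2 has no 32-bit vector rotate. */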
+ vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (20-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (20-7)*32(%rax), %ymm0, %ymm0 + vpaddd (20-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 20*32(%rax) + vmovdqa %ymm7, (20+1)*32(%rax) + vmovdqa (22-15)*32(%rax), %ymm0 + vmovdqa (22-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (22-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (22-7)*32(%rax), %ymm0, %ymm0 + vpaddd (22-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 22*32(%rax) + vmovdqa %ymm7, (22+1)*32(%rax) + vmovdqa (24-15)*32(%rax), %ymm0 + vmovdqa (24-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (24-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (24-7)*32(%rax), %ymm0, %ymm0 + vpaddd (24-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 24*32(%rax) + vmovdqa %ymm7, (24+1)*32(%rax) + vmovdqa (26-15)*32(%rax), 
%ymm0 + vmovdqa (26-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (26-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (26-7)*32(%rax), %ymm0, %ymm0 + vpaddd (26-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 26*32(%rax) + vmovdqa %ymm7, (26+1)*32(%rax) + vmovdqa (28-15)*32(%rax), %ymm0 + vmovdqa (28-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (28-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (28-7)*32(%rax), %ymm0, %ymm0 + vpaddd (28-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 28*32(%rax) + vmovdqa %ymm7, (28+1)*32(%rax) + vmovdqa (30-15)*32(%rax), %ymm0 + vmovdqa (30-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (30-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (30-7)*32(%rax), %ymm0, %ymm0 + vpaddd (30-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + 
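/* %ymm3/%ymm7 carry the last pair of schedule words from block to block, so s1 is
+	 * applied to registers instead of reloading them; each new pair is stored back to
+	 * i*32(%rax) for the main rounds that follow. */ +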
vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 30*32(%rax) + vmovdqa %ymm7, (30+1)*32(%rax) + vmovdqa (32-15)*32(%rax), %ymm0 + vmovdqa (32-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (32-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (32-7)*32(%rax), %ymm0, %ymm0 + vpaddd (32-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 32*32(%rax) + vmovdqa %ymm7, (32+1)*32(%rax) + vmovdqa (34-15)*32(%rax), %ymm0 + vmovdqa (34-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (34-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (34-7)*32(%rax), %ymm0, %ymm0 + vpaddd (34-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 34*32(%rax) + vmovdqa %ymm7, (34+1)*32(%rax) + vmovdqa (36-15)*32(%rax), %ymm0 + vmovdqa (36-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, 
%ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (36-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (36-7)*32(%rax), %ymm0, %ymm0 + vpaddd (36-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 36*32(%rax) + vmovdqa %ymm7, (36+1)*32(%rax) + vmovdqa (38-15)*32(%rax), %ymm0 + vmovdqa (38-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (38-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (38-7)*32(%rax), %ymm0, %ymm0 + vpaddd (38-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 38*32(%rax) + vmovdqa %ymm7, (38+1)*32(%rax) + vmovdqa (40-15)*32(%rax), %ymm0 + vmovdqa (40-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (40-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (40-7)*32(%rax), %ymm0, %ymm0 + vpaddd (40-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 40*32(%rax) + vmovdqa %ymm7, (40+1)*32(%rax) + vmovdqa (42-15)*32(%rax), %ymm0 + vmovdqa (42-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + 
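/* These unrolled doublerounds replace the 4-way sha256_avx_extend_round /
+	 * sha256_xop_extend_doubleround macro expansions removed elsewhere in this hunk,
+	 * widened from 128-bit %xmm (4 lanes, 16-byte stride) to 256-bit %ymm (8 lanes,
+	 * 32-byte stride). */ +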
vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (42-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 -.macro sha256_avx_main_round_red i, r0, r1, r2, r3, r4 - vpaddd 16*\i(%rax), \r0, %xmm6 - vpaddd 16*\i(%rcx), %xmm6, %xmm6 - vpandn \r1, \r3, %xmm1 - vpand \r3, \r2, %xmm2 - vpxor %xmm2, %xmm1, %xmm1 - vpaddd %xmm1, %xmm6, %xmm6 - vpslld $7, \r3, %xmm1 - vpsrld $6, \r3, \r0 - vpsrld $5, \r0, %xmm2 - vpxor %xmm1, \r0, \r0 - vpxor %xmm2, \r0, \r0 - vpslld $14, %xmm1, %xmm1 - vpsrld $14, %xmm2, %xmm2 - vpxor %xmm1, \r0, \r0 - vpxor %xmm2, \r0, \r0 - vpslld $5, %xmm1, %xmm1 - vpxor %xmm1, \r0, \r0 - vpaddd \r0, %xmm6, %xmm6 - vpaddd %xmm6, \r4, \r0 -.endm + vpaddd (42-7)*32(%rax), %ymm0, %ymm0 + vpaddd (42-6)*32(%rax), %ymm4, %ymm4 -sha256d_ms_4way_avx_finish: - sha256_avx_main_round_red 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4 - sha256_avx_main_round_red 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5 - sha256_avx_main_round_red 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7 - sha256_avx_main_round_red 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3 - - paddd sha256_4h+112(%rip), %xmm10 - movdqa %xmm10, 112(%rdi) - - addq $1032, %rsp -#if defined(_WIN64) || defined(__CYGWIN__) - popq %rsi - movdqa 0(%rsp), %xmm6 - movdqa 16(%rsp), %xmm7 - movdqa 32(%rsp), %xmm8 - movdqa 48(%rsp), %xmm9 - movdqa 64(%rsp), %xmm10 - addq $80, %rsp - popq %rdi -#endif - ret - -#endif /* USE_AVX */ - - -#if defined(USE_XOP) - - .p2align 6 -sha256d_ms_4way_xop: -#if defined(_WIN64) || defined(__CYGWIN__) - pushq %rdi - subq $80, %rsp - movdqa %xmm6, 0(%rsp) - movdqa %xmm7, 16(%rsp) - movdqa %xmm8, 32(%rsp) - movdqa %xmm9, 48(%rsp) - movdqa %xmm10, 64(%rsp) - pushq %rsi - movq %rcx, %rdi - movq %rdx, %rsi - movq %r8, %rdx - movq %r9, %rcx -#endif - subq $1032, %rsp - - leaq 256(%rsi), %rax - -sha256d_ms_4way_xop_extend_loop1: - vmovdqa 3*16(%rsi), %xmm0 - vmovdqa 2*16(%rax), %xmm3 - vmovdqa 3*16(%rax), %xmm7 - vmovdqa %xmm3, 2*16(%rsp) - vmovdqa %xmm7, 3*16(%rsp) - vpaddd %xmm0, %xmm7, %xmm7 - vprotd $25, %xmm0, %xmm1 - vprotd $14, %xmm0, %xmm2 - vpsrld $3, %xmm0, %xmm0 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm2, %xmm0, %xmm0 - vpaddd %xmm0, %xmm3, %xmm3 - vmovdqa %xmm3, 2*16(%rax) - vmovdqa %xmm7, 3*16(%rax) - - vmovdqa 4*16(%rax), %xmm0 - vmovdqa %xmm0, 4*16(%rsp) - vprotd $15, %xmm3, %xmm1 - vprotd $15, %xmm7, %xmm5 - vprotd $13, %xmm3, %xmm2 - vprotd $13, %xmm7, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd %xmm0, %xmm3, %xmm3 - vmovdqa %xmm3, 4*16(%rax) - vmovdqa %xmm7, 5*16(%rax) - - vmovdqa 6*16(%rax), %xmm0 - vmovdqa 7*16(%rax), %xmm4 - vmovdqa %xmm0, 6*16(%rsp) - vmovdqa %xmm4, 7*16(%rsp) - vprotd $15, %xmm3, %xmm1 - vprotd $15, %xmm7, %xmm5 - vprotd $13, %xmm3, %xmm2 - vprotd $13, %xmm7, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd %xmm0, 
%xmm3, %xmm3 - vpaddd %xmm4, %xmm7, %xmm7 - vmovdqa %xmm3, 6*16(%rax) - vmovdqa %xmm7, 7*16(%rax) - - vmovdqa 8*16(%rax), %xmm0 - vmovdqa 2*16(%rax), %xmm4 - vmovdqa %xmm0, 8*16(%rsp) - vprotd $15, %xmm3, %xmm1 - vprotd $15, %xmm7, %xmm5 - vprotd $13, %xmm3, %xmm2 - vprotd $13, %xmm7, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd %xmm0, %xmm3, %xmm3 - vpaddd %xmm4, %xmm7, %xmm7 - vmovdqa %xmm3, 8*16(%rax) - vmovdqa %xmm7, 9*16(%rax) - - vprotd $15, %xmm3, %xmm1 - vprotd $15, %xmm7, %xmm5 - vprotd $13, %xmm3, %xmm2 - vprotd $13, %xmm7, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd 3*16(%rax), %xmm3, %xmm3 - vpaddd 4*16(%rax), %xmm7, %xmm7 - vmovdqa %xmm3, 10*16(%rax) - vmovdqa %xmm7, 11*16(%rax) - - vprotd $15, %xmm3, %xmm1 - vprotd $15, %xmm7, %xmm5 - vprotd $13, %xmm3, %xmm2 - vprotd $13, %xmm7, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd 5*16(%rax), %xmm3, %xmm3 - vpaddd 6*16(%rax), %xmm7, %xmm7 - vmovdqa %xmm3, 12*16(%rax) - vmovdqa %xmm7, 13*16(%rax) - - vmovdqa 14*16(%rax), %xmm0 - vmovdqa 15*16(%rax), %xmm4 - vmovdqa %xmm0, 14*16(%rsp) - vmovdqa %xmm4, 15*16(%rsp) - vprotd $15, %xmm3, %xmm1 - vprotd $15, %xmm7, %xmm5 - vprotd $13, %xmm3, %xmm2 - vprotd $13, %xmm7, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - vpaddd 7*16(%rax), %xmm0, %xmm0 - vpaddd 8*16(%rax), %xmm4, %xmm4 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd %xmm0, %xmm3, %xmm3 - vpaddd %xmm4, %xmm7, %xmm7 - vmovdqa %xmm3, 14*16(%rax) - vmovdqa %xmm7, 15*16(%rax) - -sha256d_ms_4way_xop_extend_loop2: - sha256_xop_extend_doubleround 16 - sha256_xop_extend_doubleround 18 - sha256_xop_extend_doubleround 20 - sha256_xop_extend_doubleround 22 - sha256_xop_extend_doubleround 24 - sha256_xop_extend_doubleround 26 - sha256_xop_extend_doubleround 28 - sha256_xop_extend_doubleround 30 - sha256_xop_extend_doubleround 32 - sha256_xop_extend_doubleround 34 - sha256_xop_extend_doubleround 36 - sha256_xop_extend_doubleround 38 - sha256_xop_extend_doubleround 40 - sha256_xop_extend_doubleround 42 - jz sha256d_ms_4way_xop_extend_coda2 - sha256_xop_extend_doubleround 44 - sha256_xop_extend_doubleround 46 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 42*32(%rax) + vmovdqa %ymm7, (42+1)*32(%rax) + jz sha256d_ms_8way_avx2_extend_coda2 + vmovdqa (44-15)*32(%rax), %ymm0 + vmovdqa (44-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + 
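/* The jz above exits to the shortened _extend_coda2 path when the zero flag is set,
+	 * apparently so the second SHA-256 of the double hash only extends the schedule
+	 * words it actually needs, mirroring the jz sha256d_ms_4way_xop_extend_coda2 test
+	 * in the 4-way code removed here. */ +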
vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (44-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (44-7)*32(%rax), %ymm0, %ymm0 + vpaddd (44-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 44*32(%rax) + vmovdqa %ymm7, (44+1)*32(%rax) + vmovdqa (46-15)*32(%rax), %ymm0 + vmovdqa (46-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (46-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (46-7)*32(%rax), %ymm0, %ymm0 + vpaddd (46-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 46*32(%rax) + vmovdqa %ymm7, (46+1)*32(%rax) - movdqa 0(%rcx), %xmm7 - movdqa 16(%rcx), %xmm8 - movdqa 32(%rcx), %xmm9 - movdqa 48(%rcx), %xmm10 - movdqa 64(%rcx), %xmm0 - movdqa 80(%rcx), %xmm5 - movdqa 96(%rcx), %xmm4 - movdqa 112(%rcx), %xmm3 + vmovdqa 0(%rcx), %ymm7 + vmovdqa 32(%rcx), %ymm8 + vmovdqa 64(%rcx), %ymm9 + vmovdqa 96(%rcx), %ymm10 + vmovdqa 128(%rcx), %ymm0 + vmovdqa 160(%rcx), %ymm5 + vmovdqa 192(%rcx), %ymm4 + vmovdqa 224(%rcx), %ymm3 movq %rsi, %rax - leaq sha256_4k(%rip), %rcx - jmp sha256d_ms_4way_xop_main_loop1 - -sha256d_ms_4way_xop_main_loop2: - sha256_xop_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 - sha256_xop_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 - sha256_xop_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 -sha256d_ms_4way_xop_main_loop1: - sha256_xop_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 - sha256_xop_main_quadround 4 - sha256_xop_main_quadround 8 - sha256_xop_main_quadround 12 - sha256_xop_main_quadround 16 - sha256_xop_main_quadround 20 - sha256_xop_main_quadround 24 - sha256_xop_main_quadround 28 - sha256_xop_main_quadround 32 - sha256_xop_main_quadround 36 - sha256_xop_main_quadround 40 - sha256_xop_main_quadround 44 - sha256_xop_main_quadround 48 - sha256_xop_main_quadround 52 - sha256_xop_main_round 56, 
%xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 - jz sha256d_ms_4way_xop_finish - sha256_xop_main_round 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 - sha256_xop_main_round 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 - sha256_xop_main_round 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 - sha256_xop_main_quadround 60 - - movdqa 2*16(%rsp), %xmm1 - movdqa 3*16(%rsp), %xmm2 - movdqa 4*16(%rsp), %xmm6 - movdqa %xmm1, 18*16(%rsi) - movdqa %xmm2, 19*16(%rsi) - movdqa %xmm6, 20*16(%rsi) - movdqa 6*16(%rsp), %xmm1 - movdqa 7*16(%rsp), %xmm2 - movdqa 8*16(%rsp), %xmm6 - movdqa %xmm1, 22*16(%rsi) - movdqa %xmm2, 23*16(%rsi) - movdqa %xmm6, 24*16(%rsi) - movdqa 14*16(%rsp), %xmm1 - movdqa 15*16(%rsp), %xmm2 - movdqa %xmm1, 30*16(%rsi) - movdqa %xmm2, 31*16(%rsi) - - paddd 0(%rdx), %xmm7 - paddd 16(%rdx), %xmm5 - paddd 32(%rdx), %xmm4 - paddd 48(%rdx), %xmm3 - paddd 64(%rdx), %xmm0 - paddd 80(%rdx), %xmm8 - paddd 96(%rdx), %xmm9 - paddd 112(%rdx), %xmm10 - - movdqa %xmm7, 0(%rsp) - movdqa %xmm5, 16(%rsp) - movdqa %xmm4, 32(%rsp) - movdqa %xmm3, 48(%rsp) - movdqa %xmm0, 64(%rsp) - movdqa %xmm8, 80(%rsp) - movdqa %xmm9, 96(%rsp) - movdqa %xmm10, 112(%rsp) - - pxor %xmm0, %xmm0 - movq $0x8000000000000100, %rax - movd %rax, %xmm1 - pshufd $0x55, %xmm1, %xmm2 - pshufd $0x00, %xmm1, %xmm1 - movdqa %xmm2, 128(%rsp) - movdqa %xmm0, 144(%rsp) - movdqa %xmm0, 160(%rsp) - movdqa %xmm0, 176(%rsp) - movdqa %xmm0, 192(%rsp) - movdqa %xmm0, 208(%rsp) - movdqa %xmm0, 224(%rsp) - movdqa %xmm1, 240(%rsp) - - leaq 256(%rsp), %rax - cmpq %rax, %rax - - vmovdqa -15*16(%rax), %xmm0 - vmovdqa -14*16(%rax), %xmm4 - vprotd $25, %xmm0, %xmm1 - vprotd $25, %xmm4, %xmm5 - vprotd $14, %xmm0, %xmm2 - vprotd $14, %xmm4, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - vpsrld $3, %xmm0, %xmm8 - vpsrld $3, %xmm4, %xmm4 - vpxor %xmm2, %xmm8, %xmm8 - vpxor %xmm6, %xmm4, %xmm4 - vpaddd %xmm0, %xmm4, %xmm4 - vpaddd -16*16(%rax), %xmm8, %xmm3 - vpaddd sha256d_4preext2_17(%rip), %xmm4, %xmm7 - vmovdqa %xmm3, 0*16(%rax) - vmovdqa %xmm7, 1*16(%rax) - - sha256_xop_extend_doubleround 2 - sha256_xop_extend_doubleround 4 - - vmovdqa -9*16(%rax), %xmm0 - vprotd $25, %xmm0, %xmm1 - vprotd $14, %xmm0, %xmm2 - vpsrld $3, %xmm0, %xmm8 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm2, %xmm8, %xmm8 - vpaddd sha256d_4preext2_23(%rip), %xmm0, %xmm4 - vpaddd -10*16(%rax), %xmm8, %xmm0 - vprotd $15, %xmm3, %xmm1 - vprotd $15, %xmm7, %xmm5 - vprotd $13, %xmm3, %xmm2 - vprotd $13, %xmm7, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - vpaddd -1*16(%rax), %xmm0, %xmm0 - vpaddd 0*16(%rax), %xmm4, %xmm4 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd %xmm0, %xmm3, %xmm3 - vpaddd %xmm4, %xmm7, %xmm7 - vmovdqa %xmm3, 6*16(%rax) - vmovdqa %xmm7, 7*16(%rax) - - vprotd $15, %xmm3, %xmm1 - vprotd $15, %xmm7, %xmm5 - vprotd $13, %xmm3, %xmm2 - vprotd $13, %xmm7, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd sha256d_4preext2_24(%rip), %xmm3, %xmm3 - vpaddd 1*16(%rax), %xmm3, %xmm3 - vpaddd 2*16(%rax), %xmm7, %xmm7 - vmovdqa %xmm3, 8*16(%rax) - vmovdqa %xmm7, 9*16(%rax) - - vprotd $15, %xmm3, %xmm1 - vprotd $15, %xmm7, %xmm5 - vprotd $13, %xmm3, %xmm2 - vprotd $13, %xmm7, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - 
vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd 3*16(%rax), %xmm3, %xmm3 - vpaddd 4*16(%rax), %xmm7, %xmm7 - vmovdqa %xmm3, 10*16(%rax) - vmovdqa %xmm7, 11*16(%rax) - - vprotd $15, %xmm3, %xmm1 - vprotd $15, %xmm7, %xmm5 - vprotd $13, %xmm3, %xmm2 - vprotd $13, %xmm7, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd 5*16(%rax), %xmm3, %xmm3 - vpaddd 6*16(%rax), %xmm7, %xmm7 - vmovdqa %xmm3, 12*16(%rax) - vmovdqa %xmm7, 13*16(%rax) - - vmovdqa sha256d_4preext2_30(%rip), %xmm0 - vmovdqa 0*16(%rax), %xmm4 - vprotd $25, %xmm4, %xmm5 - vprotd $14, %xmm4, %xmm6 - vpxor %xmm5, %xmm6, %xmm6 - vpsrld $3, %xmm4, %xmm4 - vpxor %xmm6, %xmm4, %xmm4 - vpaddd -1*16(%rax), %xmm4, %xmm4 - vprotd $15, %xmm3, %xmm1 - vprotd $15, %xmm7, %xmm5 - vprotd $13, %xmm3, %xmm2 - vprotd $13, %xmm7, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - vpaddd 7*16(%rax), %xmm0, %xmm0 - vpaddd 8*16(%rax), %xmm4, %xmm4 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd %xmm0, %xmm3, %xmm3 - vpaddd %xmm4, %xmm7, %xmm7 - vmovdqa %xmm3, 14*16(%rax) - vmovdqa %xmm7, 15*16(%rax) - - jmp sha256d_ms_4way_xop_extend_loop2 - -sha256d_ms_4way_xop_extend_coda2: - sha256_xop_extend_round 44 - - movdqa sha256_4h+0(%rip), %xmm7 - movdqa sha256_4h+16(%rip), %xmm5 - movdqa sha256_4h+32(%rip), %xmm4 - movdqa sha256_4h+48(%rip), %xmm3 - movdqa sha256_4h+64(%rip), %xmm0 - movdqa sha256_4h+80(%rip), %xmm8 - movdqa sha256_4h+96(%rip), %xmm9 - movdqa sha256_4h+112(%rip), %xmm10 + leaq sha256_8k(%rip), %rcx + jmp sha256d_ms_8way_avx2_main_loop1 - movq %rsp, %rax - leaq sha256_4k(%rip), %rcx - jmp sha256d_ms_4way_xop_main_loop2 +sha256d_ms_8way_avx2_main_loop2: + vpaddd 32*(0)(%rax), %ymm10, %ymm6 + vpaddd 32*(0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(1)(%rax), %ymm9, %ymm6 + vpaddd 32*(1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, 
%ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(2)(%rax), %ymm8, %ymm6 + vpaddd 32*(2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 +sha256d_ms_8way_avx2_main_loop1: + vpaddd 32*(3)(%rax), %ymm0, %ymm6 + vpaddd 32*(3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(4+0)(%rax), %ymm10, %ymm6 + vpaddd 32*(4+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $11, %ymm2, 
%ymm2 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(4+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(4+1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(4+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(4+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(4+3)(%rax), %ymm0, %ymm6 + vpaddd 32*(4+3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(8+0)(%rax), %ymm10, %ymm6 + vpaddd 32*(8+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, 
%ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(8+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(8+1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(8+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(8+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(8+3)(%rax), %ymm0, %ymm6 + vpaddd 32*(8+3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, 
%ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(12+0)(%rax), %ymm10, %ymm6 + vpaddd 32*(12+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(12+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(12+1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(12+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(12+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(12+3)(%rax), %ymm0, 
%ymm6 + vpaddd 32*(12+3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(16+0)(%rax), %ymm10, %ymm6 + vpaddd 32*(16+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(16+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(16+1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(16+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(16+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, 
%ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(16+3)(%rax), %ymm0, %ymm6 + vpaddd 32*(16+3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(20+0)(%rax), %ymm10, %ymm6 + vpaddd 32*(20+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(20+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(20+1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + 
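/* Main rounds: each 32*(i) block consumes W[i] through %rax and the round constant
+	 * from the sha256_8k table through %rcx; the a..h roles rotate with the unrolling,
+	 * so the working variables never need register-to-register moves. */ +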
vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(20+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(20+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(20+3)(%rax), %ymm0, %ymm6 + vpaddd 32*(20+3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(24+0)(%rax), %ymm10, %ymm6 + vpaddd 32*(24+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(24+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(24+1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, 
%ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(24+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(24+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(24+3)(%rax), %ymm0, %ymm6 + vpaddd 32*(24+3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(28+0)(%rax), %ymm10, %ymm6 + vpaddd 32*(28+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, 
%ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(28+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(28+1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(28+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(28+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(28+3)(%rax), %ymm0, %ymm6 + vpaddd 32*(28+3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + 
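The three-instruction vpandn/vpand/vpxor groups in these rounds compute the SHA-256 choose function, Ch(e,f,g) = (e AND f) XOR (NOT e AND g), and the three-vpand/two-vpxor groups compute the majority function in its (a AND b) XOR (a AND c) XOR (b AND c) form, eight lanes per instruction. A minimal C intrinsics sketch of both, with illustrative helper names that are not part of this patch:

    #include <immintrin.h>

    /* Ch(e,f,g) = (e & f) ^ (~e & g); _mm256_andnot_si256(a,b) computes
     * (~a) & b, which is exactly what vpandn does above. */
    static inline __m256i sha256_8way_ch(__m256i e, __m256i f, __m256i g)
    {
        return _mm256_xor_si256(_mm256_and_si256(e, f),
                                _mm256_andnot_si256(e, g));
    }

    /* Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c), matching the three vpand /
     * two vpxor sequence used between rounds. */
    static inline __m256i sha256_8way_maj(__m256i a, __m256i b, __m256i c)
    {
        __m256i ab = _mm256_and_si256(a, b);
        __m256i ac = _mm256_and_si256(a, c);
        __m256i bc = _mm256_and_si256(b, c);
        return _mm256_xor_si256(ab, _mm256_xor_si256(ac, bc));
    }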
vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(32+0)(%rax), %ymm10, %ymm6 + vpaddd 32*(32+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(32+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(32+1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(32+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(32+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(32+3)(%rax), %ymm0, %ymm6 + vpaddd 32*(32+3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, 
%ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(36+0)(%rax), %ymm10, %ymm6 + vpaddd 32*(36+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(36+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(36+1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(36+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(36+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + 
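Taken together, every block that opens with a pair of vpaddd loads from (%rax) and (%rcx) is one full SHA-256 compression round: %rax holds the expanded message schedule W (32 bytes per word, one dword per lane) and %rcx the round constants, so each instruction advances eight independent hashes at once. The working variables a..h are never shuffled; the code simply renames ymm registers from round to round. For reference, a scalar C sketch of the same round update (rotr32 and sha256_round are illustrative names, not functions from this patch):

    #include <stdint.h>

    static inline uint32_t rotr32(uint32_t x, unsigned n)
    {
        return (x >> n) | (x << (32 - n));
    }

    /* One SHA-256 round: s[0..7] = a..h, w = W[i], k = K[i]. */
    static void sha256_round(uint32_t s[8], uint32_t w, uint32_t k)
    {
        uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
        uint32_t e = s[4], f = s[5], g = s[6], h = s[7];
        uint32_t S1  = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
        uint32_t ch  = (e & f) ^ (~e & g);
        uint32_t t1  = h + S1 + ch + k + w;
        uint32_t S0  = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
        uint32_t maj = (a & b) ^ (a & c) ^ (b & c);
        uint32_t t2  = S0 + maj;
        s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
        s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
    }

In the vector code t1 accumulates in %ymm6 (the old h plus W and K, then Ch, then Σ1), d + t1 becomes the new e, and t1 + Maj + Σ0 becomes the new a, exactly as above.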
vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(36+3)(%rax), %ymm0, %ymm6 + vpaddd 32*(36+3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(40+0)(%rax), %ymm10, %ymm6 + vpaddd 32*(40+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(40+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(40+1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, 
%ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(40+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(40+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(40+3)(%rax), %ymm0, %ymm6 + vpaddd 32*(40+3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(44+0)(%rax), %ymm10, %ymm6 + vpaddd 32*(44+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 -.macro sha256_xop_main_round_red i, r0, r1, r2, r3, r4 - vpaddd 16*\i(%rax), \r0, %xmm6 - vpaddd 16*\i(%rcx), %xmm6, %xmm6 - vpandn \r1, \r3, %xmm1 - vpand \r3, \r2, %xmm2 - vpxor %xmm2, %xmm1, %xmm1 - vpaddd %xmm1, %xmm6, %xmm6 - vprotd $26, \r3, %xmm1 - vprotd $21, \r3, %xmm2 - vpxor %xmm1, %xmm2, %xmm2 - vprotd $7, \r3, \r0 - vpxor %xmm2, \r0, \r0 - vpaddd \r0, %xmm6, %xmm6 - vpaddd %xmm6, \r4, \r0 -.endm + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 -sha256d_ms_4way_xop_finish: - sha256_xop_main_round_red 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4 - sha256_xop_main_round_red 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5 - sha256_xop_main_round_red 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7 - sha256_xop_main_round_red 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3 - - paddd sha256_4h+112(%rip), %xmm10 - movdqa 
%xmm10, 112(%rdi) - - addq $1032, %rsp -#if defined(_WIN64) || defined(__CYGWIN__) - popq %rsi - movdqa 0(%rsp), %xmm6 - movdqa 16(%rsp), %xmm7 - movdqa 32(%rsp), %xmm8 - movdqa 48(%rsp), %xmm9 - movdqa 64(%rsp), %xmm10 - addq $80, %rsp - popq %rdi -#endif - ret - -#endif /* USE_XOP */ + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(44+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(44+1)(%rcx), %ymm6, %ymm6 - .text - .p2align 6 - .globl sha256_use_4way - .globl _sha256_use_4way -sha256_use_4way: -_sha256_use_4way: - pushq %rbx - pushq %rcx - pushq %rdx - - /* Check for VIA PadLock Hash Engine */ - movl $0xc0000000, %eax - cpuid - cmpl $0xc0000001, %eax - jb sha256_use_4way_no_phe - movl $0xc0000001, %eax - cpuid - andl $0x00000c00, %edx - cmpl $0x00000c00, %edx - jne sha256_use_4way_no_phe - leaq sha256_transform_phe(%rip), %rdx - movq %rdx, sha256_transform_addr(%rip) - xorl %eax, %eax - jmp sha256_use_4way_exit -sha256_use_4way_no_phe: -#if defined(USE_AVX) - /* Check for AVX and OSXSAVE support */ - movl $1, %eax - cpuid - andl $0x18000000, %ecx - cmpl $0x18000000, %ecx - jne sha256_use_4way_base - /* Check for XMM and YMM state support */ - xorl %ecx, %ecx - xgetbv - andl $0x00000006, %eax - cmpl $0x00000006, %eax - jne sha256_use_4way_base -#if defined(USE_XOP) - /* Check for XOP support */ - movl $0x80000001, %eax - cpuid - andl $0x00000800, %ecx - jz sha256_use_4way_avx - -sha256_use_4way_xop: - leaq sha256d_ms_4way_xop(%rip), %rcx - leaq sha256_transform_4way_core_xop(%rip), %rdx - jmp sha256_use_4way_done -#endif /* USE_XOP */ - -sha256_use_4way_avx: - leaq sha256d_ms_4way_avx(%rip), %rcx - leaq sha256_transform_4way_core_avx(%rip), %rdx - jmp sha256_use_4way_done -#endif /* USE_AVX */ - -sha256_use_4way_base: - leaq sha256d_ms_4way_sse2(%rip), %rcx - leaq sha256_transform_4way_core_sse2(%rip), %rdx - -sha256_use_4way_done: - movq %rcx, sha256d_ms_4way_addr(%rip) - movq %rdx, sha256_transform_4way_core_addr(%rip) - movl $1, %eax -sha256_use_4way_exit: - popq %rdx - popq %rcx - popq %rbx - ret + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 -#if defined(USE_AVX2) + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 - .text - .p2align 6 - .globl sha256d_ms_8way - .globl _sha256d_ms_8way -sha256d_ms_8way: -_sha256d_ms_8way: -sha256d_ms_8way_avx2: -#if defined(_WIN64) || defined(__CYGWIN__) - pushq %rdi - subq $80, %rsp - vmovdqa %xmm6, 0(%rsp) - vmovdqa %xmm7, 16(%rsp) - vmovdqa %xmm8, 32(%rsp) - vmovdqa %xmm9, 48(%rsp) - vmovdqa %xmm10, 64(%rsp) - pushq %rsi - movq %rcx, %rdi - movq %rdx, %rsi - movq %r8, 
%rdx - movq %r9, %rcx -#endif - pushq %rbp - movq %rsp, %rbp - subq $64*32, %rsp - andq $-128, %rsp - - leaq 16*32(%rsi), %rax - -sha256d_ms_8way_avx2_extend_loop1: - vmovdqa 3*32(%rsi), %ymm0 - vmovdqa 2*32(%rax), %ymm3 - vmovdqa 3*32(%rax), %ymm7 - vmovdqa %ymm3, 2*32(%rsp) - vmovdqa %ymm7, 3*32(%rsp) - vpaddd %ymm0, %ymm7, %ymm7 - vpslld $14, %ymm0, %ymm2 - vpsrld $3, %ymm0, %ymm0 - vpsrld $4, %ymm0, %ymm1 + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(44+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(44+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(44+3)(%rax), %ymm0, %ymm6 + vpaddd 32*(44+3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(48+0)(%rax), %ymm10, %ymm6 + vpaddd 32*(48+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, 
%ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm1, %ymm3, %ymm3 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(48+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(48+1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(48+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(48+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(48+3)(%rax), %ymm0, %ymm6 + vpaddd 32*(48+3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 vpxor %ymm1, %ymm0, %ymm0 vpxor %ymm2, %ymm0, %ymm0 - vpsrld $11, %ymm1, %ymm1 - vpslld $11, %ymm2, %ymm2 + vpslld $5, %ymm1, %ymm1 vpxor %ymm1, %ymm0, %ymm0 - vpxor %ymm2, %ymm0, %ymm0 - vpaddd %ymm0, %ymm3, %ymm3 - vmovdqa %ymm3, 2*32(%rax) - vmovdqa %ymm7, 3*32(%rax) - - vmovdqa 4*32(%rax), %ymm0 - vmovdqa %ymm0, 4*32(%rsp) - vpslld $13, %ymm3, %ymm2 - vpslld $13, %ymm7, %ymm6 - vpsrld $10, %ymm3, %ymm3 - vpsrld $10, %ymm7, %ymm7 - vpsrld $7, %ymm3, %ymm1 - vpsrld $7, %ymm7, %ymm5 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpsrld $2, %ymm1, %ymm1 - vpsrld $2, %ymm5, %ymm5 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpslld $2, %ymm2, %ymm2 - vpslld $2, 
%ymm6, %ymm6 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpaddd %ymm0, %ymm3, %ymm3 - vmovdqa %ymm3, 4*32(%rax) - vmovdqa %ymm7, 5*32(%rax) - - vmovdqa 6*32(%rax), %ymm0 - vmovdqa 7*32(%rax), %ymm4 - vmovdqa %ymm0, 6*32(%rsp) - vmovdqa %ymm4, 7*32(%rsp) - vpslld $13, %ymm3, %ymm2 - vpslld $13, %ymm7, %ymm6 - vpsrld $10, %ymm3, %ymm3 - vpsrld $10, %ymm7, %ymm7 - vpsrld $7, %ymm3, %ymm1 - vpsrld $7, %ymm7, %ymm5 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpsrld $2, %ymm1, %ymm1 - vpsrld $2, %ymm5, %ymm5 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpslld $2, %ymm2, %ymm2 - vpslld $2, %ymm6, %ymm6 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(52+0)(%rax), %ymm10, %ymm6 + vpaddd 32*(52+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpaddd %ymm0, %ymm3, %ymm3 - vpaddd %ymm4, %ymm7, %ymm7 - vmovdqa %ymm3, 6*32(%rax) - vmovdqa %ymm7, 7*32(%rax) - - vmovdqa 8*32(%rax), %ymm0 - vmovdqa 2*32(%rax), %ymm4 - vmovdqa %ymm0, 8*32(%rsp) - vpslld $13, %ymm3, %ymm2 - vpslld $13, %ymm7, %ymm6 - vpsrld $10, %ymm3, %ymm3 - vpsrld $10, %ymm7, %ymm7 - vpsrld $7, %ymm3, %ymm1 - vpsrld $7, %ymm7, %ymm5 vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpsrld $2, %ymm1, %ymm1 - vpsrld $2, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpslld $2, %ymm2, %ymm2 - vpslld $2, %ymm6, %ymm6 vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpaddd %ymm0, %ymm3, %ymm3 - vpaddd %ymm4, %ymm7, %ymm7 - vmovdqa %ymm3, 8*32(%rax) - vmovdqa %ymm7, 9*32(%rax) - - vpslld $13, %ymm3, %ymm2 - vpslld $13, %ymm7, %ymm6 - vpsrld $10, %ymm3, %ymm3 - vpsrld $10, %ymm7, %ymm7 - vpsrld $7, %ymm3, %ymm1 - vpsrld $7, %ymm7, %ymm5 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpsrld $2, %ymm1, %ymm1 - vpsrld $2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(52+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(52+1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, 
%ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(52+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(52+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(52+3)(%rax), %ymm0, %ymm6 + vpaddd 32*(52+3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(56)(%rax), %ymm10, %ymm6 + vpaddd 32*(56)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, 
%ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpslld $2, %ymm2, %ymm2 - vpslld $2, %ymm6, %ymm6 vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpaddd 3*32(%rax), %ymm3, %ymm3 - vpaddd 4*32(%rax), %ymm7, %ymm7 - vmovdqa %ymm3, 10*32(%rax) - vmovdqa %ymm7, 11*32(%rax) - - vpslld $13, %ymm3, %ymm2 - vpslld $13, %ymm7, %ymm6 - vpsrld $10, %ymm3, %ymm3 - vpsrld $10, %ymm7, %ymm7 - vpsrld $7, %ymm3, %ymm1 - vpsrld $7, %ymm7, %ymm5 vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpsrld $2, %ymm1, %ymm1 - vpsrld $2, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpslld $2, %ymm2, %ymm2 - vpslld $2, %ymm6, %ymm6 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 + vpaddd %ymm6, %ymm3, %ymm3 + jz sha256d_ms_8way_avx2_finish + vpaddd 32*(57)(%rax), %ymm9, %ymm6 + vpaddd 32*(57)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(58)(%rax), %ymm8, %ymm6 + vpaddd 32*(58)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(59)(%rax), %ymm0, %ymm6 + vpaddd 32*(59)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + 
vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 + vpaddd 32*(60+0)(%rax), %ymm10, %ymm6 + vpaddd 32*(60+0)(%rcx), %ymm6, %ymm6 + + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 + + vpand %ymm5, %ymm4, %ymm2 + vpand %ymm7, %ymm4, %ymm3 + vpand %ymm7, %ymm5, %ymm1 + vpxor %ymm3, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm7, %ymm2 + vpsrld $2, %ymm7, %ymm3 + vpsrld $11, %ymm3, %ymm1 vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpaddd 5*32(%rax), %ymm3, %ymm3 - vpaddd 6*32(%rax), %ymm7, %ymm7 - vmovdqa %ymm3, 12*32(%rax) - vmovdqa %ymm7, 13*32(%rax) - - vmovdqa 14*32(%rax), %ymm0 - vmovdqa 15*32(%rax), %ymm4 - vmovdqa %ymm0, 14*32(%rsp) - vmovdqa %ymm4, 15*32(%rsp) - vpslld $13, %ymm3, %ymm2 - vpslld $13, %ymm7, %ymm6 - vpsrld $10, %ymm3, %ymm3 - vpsrld $10, %ymm7, %ymm7 - vpaddd 7*32(%rax), %ymm0, %ymm0 - vpaddd 8*32(%rax), %ymm4, %ymm4 - vpsrld $7, %ymm3, %ymm1 - vpsrld $7, %ymm7, %ymm5 vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpsrld $2, %ymm1, %ymm1 - vpsrld $2, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpslld $2, %ymm2, %ymm2 - vpslld $2, %ymm6, %ymm6 vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpaddd %ymm0, %ymm3, %ymm3 - vpaddd %ymm4, %ymm7, %ymm7 - vmovdqa %ymm3, 14*32(%rax) - vmovdqa %ymm7, 15*32(%rax) - -sha256d_ms_8way_avx2_extend_loop2: - sha256_avx2_extend_doubleround 16 - sha256_avx2_extend_doubleround 18 - sha256_avx2_extend_doubleround 20 - sha256_avx2_extend_doubleround 22 - sha256_avx2_extend_doubleround 24 - sha256_avx2_extend_doubleround 26 - sha256_avx2_extend_doubleround 28 - sha256_avx2_extend_doubleround 30 - sha256_avx2_extend_doubleround 32 - sha256_avx2_extend_doubleround 34 - sha256_avx2_extend_doubleround 36 - sha256_avx2_extend_doubleround 38 - sha256_avx2_extend_doubleround 40 - sha256_avx2_extend_doubleround 42 - jz sha256d_ms_8way_avx2_extend_coda2 - sha256_avx2_extend_doubleround 44 - sha256_avx2_extend_doubleround 46 - - vmovdqa 0(%rcx), %ymm7 - vmovdqa 32(%rcx), %ymm8 - vmovdqa 64(%rcx), %ymm9 - vmovdqa 96(%rcx), %ymm10 - vmovdqa 128(%rcx), %ymm0 - vmovdqa 160(%rcx), %ymm5 - vmovdqa 192(%rcx), %ymm4 - vmovdqa 224(%rcx), %ymm3 - - movq %rsi, %rax - leaq sha256_8k(%rip), %rcx - jmp 
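The vpslld $14 / vpsrld $3 and vpslld $13 / vpsrld $10 blocks elsewhere in this hunk (both in the removed extend loop above and in the new unrolled code below) are the SHA-256 message-schedule extension, again eight lanes wide and two schedule words per pass, with the W array laid out at a 32-byte stride off %rax; the copies saved to and reloaded from (%rsp) appear to cache schedule words reused by the second half of the double hash. In scalar terms the recurrence being computed is the standard one; a minimal sketch with illustrative names:

    #include <stdint.h>

    static inline uint32_t rotr32(uint32_t x, unsigned n)
    {
        return (x >> n) | (x << (32 - n));
    }

    /* sigma0 = ROTR7 ^ ROTR18 ^ SHR3, sigma1 = ROTR17 ^ ROTR19 ^ SHR10 */
    static inline uint32_t sigma0(uint32_t x)
    {
        return rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3);
    }

    static inline uint32_t sigma1(uint32_t x)
    {
        return rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10);
    }

    /* Extend the first 16 message words to the full 64-word schedule. */
    static void sha256_extend(uint32_t w[64])
    {
        for (int i = 16; i < 64; i++)
            w[i] = sigma1(w[i - 2]) + w[i - 7] + sigma0(w[i - 15]) + w[i - 16];
    }

As with the Σ functions, the assembly builds each rotation in sigma0/sigma1 from a left/right shift pair because AVX2 offers no packed 32-bit rotate.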
sha256d_ms_8way_avx2_main_loop1 - -sha256d_ms_8way_avx2_main_loop2: - sha256_avx2_main_round 0, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7 - sha256_avx2_main_round 1, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3 - sha256_avx2_main_round 2, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4 -sha256d_ms_8way_avx2_main_loop1: - sha256_avx2_main_round 3, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5 - sha256_avx2_main_quadround 4 - sha256_avx2_main_quadround 8 - sha256_avx2_main_quadround 12 - sha256_avx2_main_quadround 16 - sha256_avx2_main_quadround 20 - sha256_avx2_main_quadround 24 - sha256_avx2_main_quadround 28 - sha256_avx2_main_quadround 32 - sha256_avx2_main_quadround 36 - sha256_avx2_main_quadround 40 - sha256_avx2_main_quadround 44 - sha256_avx2_main_quadround 48 - sha256_avx2_main_quadround 52 - sha256_avx2_main_round 56, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7 - jz sha256d_ms_8way_avx2_finish - sha256_avx2_main_round 57, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3 - sha256_avx2_main_round 58, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4 - sha256_avx2_main_round 59, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5 - sha256_avx2_main_quadround 60 + vpaddd %ymm6, %ymm3, %ymm3 + vpaddd 32*(60+1)(%rax), %ymm9, %ymm6 + vpaddd 32*(60+1)(%rcx), %ymm6, %ymm6 + + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + + vpand %ymm7, %ymm5, %ymm2 + vpand %ymm3, %ymm5, %ymm4 + vpand %ymm3, %ymm7, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm3, %ymm2 + vpsrld $2, %ymm3, %ymm4 + vpsrld $11, %ymm4, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm4, %ymm4 + vpxor %ymm1, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm4, %ymm4 + vpaddd %ymm6, %ymm4, %ymm4 + vpaddd 32*(60+2)(%rax), %ymm8, %ymm6 + vpaddd 32*(60+2)(%rcx), %ymm6, %ymm6 + + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + + vpand %ymm3, %ymm7, %ymm2 + vpand %ymm4, %ymm7, %ymm5 + vpand %ymm4, %ymm3, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm4, %ymm2 + vpsrld $2, %ymm4, %ymm5 + vpsrld $11, %ymm5, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm5, %ymm5 + vpxor %ymm1, %ymm5, %ymm5 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm5, %ymm5 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd 32*(60+3)(%rax), %ymm0, %ymm6 + vpaddd 32*(60+3)(%rcx), %ymm6, %ymm6 + + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, 
%ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + + vpand %ymm4, %ymm3, %ymm2 + vpand %ymm5, %ymm3, %ymm7 + vpand %ymm5, %ymm4, %ymm1 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, %ymm5, %ymm2 + vpsrld $2, %ymm5, %ymm7 + vpsrld $11, %ymm7, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, %ymm7, %ymm7 + vpxor %ymm1, %ymm7, %ymm7 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm7, %ymm7 vmovdqa 2*32(%rsp), %ymm1 vmovdqa 3*32(%rsp), %ymm2 @@ -3967,8 +29306,108 @@ sha256d_ms_8way_avx2_main_loop1: vmovdqa %ymm3, 0*32(%rax) vmovdqa %ymm7, 1*32(%rax) - sha256_avx2_extend_doubleround 2 - sha256_avx2_extend_doubleround 4 + vmovdqa (2-15)*32(%rax), %ymm0 + vmovdqa (2-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (2-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (2-7)*32(%rax), %ymm0, %ymm0 + vpaddd (2-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 2*32(%rax) + vmovdqa %ymm7, (2+1)*32(%rax) + vmovdqa (4-15)*32(%rax), %ymm0 + vmovdqa (4-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (4-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (4-7)*32(%rax), %ymm0, %ymm0 + vpaddd (4-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, 
%ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 4*32(%rax) + vmovdqa %ymm7, (4+1)*32(%rax) vmovdqa -9*32(%rax), %ymm0 vpslld $14, %ymm0, %ymm2 @@ -4117,7 +29556,30 @@ sha256d_ms_8way_avx2_main_loop1: jmp sha256d_ms_8way_avx2_extend_loop2 sha256d_ms_8way_avx2_extend_coda2: - sha256_avx2_extend_round 44 + vmovdqa (44-15)*32(%rax), %ymm0 + vpslld $14, %ymm0, %ymm2 + vpsrld $3, %ymm0, %ymm0 + vpsrld $4, %ymm0, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpsrld $11, %ymm1, %ymm1 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpaddd (44-16)*32(%rax), %ymm0, %ymm0 + vpaddd (44-7)*32(%rax), %ymm0, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpsrld $10, %ymm3, %ymm3 + vpsrld $7, %ymm3, %ymm1 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm2, %ymm3, %ymm3 + vpsrld $2, %ymm1, %ymm1 + vpslld $2, %ymm2, %ymm2 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm0, %ymm3, %ymm3 + vmovdqa %ymm3, 44*32(%rax) vmovdqa sha256_8h+0(%rip), %ymm7 vmovdqa sha256_8h+32(%rip), %ymm5 @@ -4132,33 +29594,84 @@ sha256d_ms_8way_avx2_extend_coda2: leaq sha256_8k(%rip), %rcx jmp sha256d_ms_8way_avx2_main_loop2 -.macro sha256_avx2_main_round_red i, r0, r1, r2, r3, r4 - vpaddd 32*\i(%rax), \r0, %ymm6 - vpaddd 32*\i(%rcx), %ymm6, %ymm6 - vpandn \r1, \r3, %ymm1 - vpand \r3, \r2, %ymm2 + +sha256d_ms_8way_avx2_finish: + vpaddd 32*57(%rax), %ymm9, %ymm6 + vpaddd 32*57(%rcx), %ymm6, %ymm6 + vpandn %ymm8, %ymm10, %ymm1 + vpand %ymm10, %ymm0, %ymm2 vpxor %ymm2, %ymm1, %ymm1 vpaddd %ymm1, %ymm6, %ymm6 - vpslld $7, \r3, %ymm1 - vpsrld $6, \r3, \r0 - vpsrld $5, \r0, %ymm2 - vpxor %ymm1, \r0, \r0 - vpxor %ymm2, \r0, \r0 + vpslld $7, %ymm10, %ymm1 + vpsrld $6, %ymm10, %ymm9 + vpsrld $5, %ymm9, %ymm2 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 vpslld $14, %ymm1, %ymm1 vpsrld $14, %ymm2, %ymm2 - vpxor %ymm1, \r0, \r0 - vpxor %ymm2, \r0, \r0 + vpxor %ymm1, %ymm9, %ymm9 + vpxor %ymm2, %ymm9, %ymm9 vpslld $5, %ymm1, %ymm1 - vpxor %ymm1, \r0, \r0 - vpaddd \r0, %ymm6, %ymm6 - vpaddd %ymm6, \r4, \r0 -.endm - -sha256d_ms_8way_avx2_finish: - sha256_avx2_main_round_red 57, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4 - sha256_avx2_main_round_red 58, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5 - sha256_avx2_main_round_red 59, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7 - sha256_avx2_main_round_red 60, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3 + vpxor %ymm1, %ymm9, %ymm9 + vpaddd %ymm9, %ymm6, %ymm6 + vpaddd %ymm6, %ymm4, %ymm9 + vpaddd 32*58(%rax), %ymm8, %ymm6 + vpaddd 32*58(%rcx), %ymm6, %ymm6 + vpandn %ymm0, %ymm9, %ymm1 + vpand %ymm9, %ymm10, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + vpslld $7, %ymm9, %ymm1 + vpsrld $6, %ymm9, %ymm8 + vpsrld $5, %ymm8, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm6, %ymm5, %ymm8 + vpaddd 32*59(%rax), %ymm0, %ymm6 + vpaddd 32*59(%rcx), %ymm6, %ymm6 + vpandn %ymm10, %ymm8, %ymm1 + vpand %ymm8, %ymm9, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + vpslld $7, %ymm8, %ymm1 + vpsrld $6, %ymm8, %ymm0 + vpsrld $5, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpslld $5, 
%ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpaddd %ymm0, %ymm6, %ymm6 + vpaddd %ymm6, %ymm7, %ymm0 + vpaddd 32*60(%rax), %ymm10, %ymm6 + vpaddd 32*60(%rcx), %ymm6, %ymm6 + vpandn %ymm9, %ymm0, %ymm1 + vpand %ymm0, %ymm8, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + vpslld $7, %ymm0, %ymm1 + vpsrld $6, %ymm0, %ymm10 + vpsrld $5, %ymm10, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, %ymm10, %ymm10 + vpxor %ymm2, %ymm10, %ymm10 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, %ymm10, %ymm10 + vpaddd %ymm10, %ymm6, %ymm6 + vpaddd %ymm6, %ymm3, %ymm10 vpaddd sha256_8h+224(%rip), %ymm10, %ymm10 vmovdqa %ymm10, 224(%rdi) diff --git a/sha2-x64.S.orig b/sha2-x64.S.orig new file mode 100644 index 000000000..770d3ba29 --- /dev/null +++ b/sha2-x64.S.orig @@ -0,0 +1,4222 @@ +/* + * Copyright 2012-2015 pooler@litecoinpool.org + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. See COPYING for more details. + */ + +#include "cpuminer-config.h" + +#if defined(__linux__) && defined(__ELF__) + .section .note.GNU-stack,"",%progbits +#endif + +#if defined(USE_ASM) && defined(__x86_64__) + + .data + .p2align 4 +sha256_h: + .long 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a + .long 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 + + .data + .p2align 6 +sha256_k: + .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + +bswap_xmm_mask: + .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f + + +.macro sha256_mixed_quadround ra, rb, rc, rd, re, rf, rg, rh, x0, x1, x2, x3 + movdqa \x3, %xmm4 + movl \re, %eax + movdqa \x2, %xmm6 + rorl $(25-11), %eax + movl \ra, %ebx + pslldq $12, %xmm4 + rorl $(22-13), %ebx + psrldq $4, %xmm6 + xorl \re, %eax + movl \rf, %ecx + rorl $(11-6), %eax + pxor %xmm6, %xmm4 + movdqa \x1, %xmm5 + xorl \ra, %ebx + xorl \rg, %ecx + xorl \re, %eax + paddd \x0, %xmm4 + movdqa \x0, %xmm7 + andl \re, %ecx + rorl $(13-2), %ebx + xorl \ra, %ebx + pslldq $12, %xmm5 + psrldq $4, %xmm7 + rorl $6, %eax + xorl \rg, %ecx + pxor %xmm7, %xmm5 + rorl $2, %ebx + addl %eax, %ecx + addl (%rsp) , %ecx + movdqa %xmm5, %xmm6 + movl \ra, %eax + addl %ecx, \rh + movl \ra, %ecx + movdqa %xmm5, %xmm7 + orl \rc, %eax + addl \rh, \rd + andl \rc, %ecx + pslld $(32-7), %xmm5 + psrld $7, %xmm6 + andl \rb, %eax + addl %ebx, \rh + orl %ecx, %eax + por %xmm6, %xmm5 + addl %eax, \rh + + movl \rd, %eax + movdqa %xmm7, %xmm6 + movl \rh, %ebx + rorl $(25-11), %eax + xorl \rd, %eax + movdqa %xmm7, %xmm8 + movl \re, 
%ecx + rorl $(22-13), %ebx + xorl \rh, %ebx + pslld $(32-18), %xmm7 + rorl $(11-6), %eax + xorl \rf, %ecx + rorl $(13-2), %ebx + psrld $18, %xmm6 + xorl \rd, %eax + andl \rd, %ecx + rorl $6, %eax + pxor %xmm7, %xmm5 + xorl \rh, %ebx + xorl \rf, %ecx + psrld $3, %xmm8 + addl %eax, %ecx + addl 1*4(%rsp), %ecx + rorl $2, %ebx + pxor %xmm6, %xmm5 + movl \rh, %eax + addl %ecx, \rg + movl \rh, %ecx + pxor %xmm8, %xmm5 + orl \rb, %eax + addl \rg, \rc + andl \rb, %ecx + pshufd $0xfa, \x3, %xmm6 + andl \ra, %eax + addl %ebx, \rg + paddd %xmm5, %xmm4 + orl %ecx, %eax + addl %eax, \rg + + movl \rc, %eax + movdqa %xmm6, %xmm7 + movl \rg, %ebx + rorl $(25-11), %eax + xorl \rc, %eax + movdqa %xmm6, %xmm8 + rorl $(22-13), %ebx + movl \rd, %ecx + xorl \rg, %ebx + psrlq $17, %xmm6 + psrlq $19, %xmm7 + rorl $(11-6), %eax + xorl \re, %ecx + xorl \rc, %eax + psrld $10, %xmm8 + pxor %xmm7, %xmm6 + andl \rc, %ecx + rorl $(13-2), %ebx + xorl \rg, %ebx + pxor %xmm6, %xmm8 + xorl \re, %ecx + rorl $6, %eax + addl %eax, %ecx + pshufd $0x8f, %xmm8, %xmm8 + rorl $2, %ebx + addl 2*4(%rsp), %ecx + movl \rg, %eax + psrldq $8, %xmm8 + addl %ecx, \rf + movl \rg, %ecx + orl \ra, %eax + paddd %xmm8, %xmm4 + addl \rf, \rb + andl \ra, %ecx + andl \rh, %eax + pshufd $0x50, %xmm4, %xmm6 + addl %ebx, \rf + orl %ecx, %eax + addl %eax, \rf + + movdqa %xmm6, %xmm7 + movl \rb, %eax + rorl $(25-11), %eax + movl \rf, %ebx + movdqa %xmm6, \x0 + rorl $(22-13), %ebx + xorl \rb, %eax + movl \rc, %ecx + psrlq $17, %xmm6 + rorl $(11-6), %eax + xorl \rf, %ebx + xorl \rd, %ecx + psrlq $19, %xmm7 + xorl \rb, %eax + andl \rb, %ecx + rorl $(13-2), %ebx + psrld $10, \x0 + xorl \rf, %ebx + rorl $6, %eax + pxor %xmm7, %xmm6 + xorl \rd, %ecx + rorl $2, %ebx + addl %eax, %ecx + pxor %xmm6, \x0 + addl 3*4(%rsp), %ecx + movl \rf, %eax + addl %ecx, \re + pshufd $0xf8, \x0, \x0 + movl \rf, %ecx + orl \rh, %eax + addl \re, \ra + pslldq $8, \x0 + andl \rh, %ecx + andl \rg, %eax + paddd %xmm4, \x0 + addl %ebx, \re + orl %ecx, %eax + addl %eax, \re +.endm + +.macro sha256_main_round i, ra, rb, rc, rd, re, rf, rg, rh + movl \re, %eax + rorl $(25-11), %eax + movl \ra, %ebx + xorl \re, %eax + rorl $(22-13), %ebx + movl \rf, %ecx + xorl \ra, %ebx + rorl $(11-6), %eax + xorl \rg, %ecx + xorl \re, %eax + rorl $(13-2), %ebx + andl \re, %ecx + xorl \ra, %ebx + rorl $6, %eax + xorl \rg, %ecx + addl %eax, %ecx + rorl $2, %ebx + addl \i*4(%rsp), %ecx + movl \ra, %eax + addl %ecx, \rh + movl \ra, %ecx + orl \rc, %eax + addl \rh, \rd + andl \rc, %ecx + andl \rb, %eax + addl %ebx, \rh + orl %ecx, %eax + addl %eax, \rh +.endm + + + .text + .p2align 6 +sha256_transform_sse2: + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + pushq %rsi + subq $5*16, %rsp + movdqa %xmm6, 1*16(%rsp) + movdqa %xmm7, 2*16(%rsp) + movdqa %xmm8, 3*16(%rsp) + movdqa %xmm9, 4*16(%rsp) + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#else + subq $16, %rsp +#endif + + movl 0*4(%rdi), %r8d + movl 1*4(%rdi), %r9d + movl 2*4(%rdi), %r10d + movl 3*4(%rdi), %r11d + movl 4*4(%rdi), %r12d + movl 5*4(%rdi), %r13d + movl 6*4(%rdi), %r14d + movl 7*4(%rdi), %r15d + + testq %rdx, %rdx + jnz sha256_transform_sse2_swap + + movdqu 0*16(%rsi), %xmm0 + movdqu 1*16(%rsi), %xmm1 + movdqu 2*16(%rsi), %xmm2 + movdqu 3*16(%rsi), %xmm3 + jmp sha256_transform_sse2_core + +sha256_transform_sse2_swap: + movdqu 0*16(%rsi), %xmm0 + movdqu 1*16(%rsi), %xmm1 + movdqu 2*16(%rsi), %xmm2 + movdqu 3*16(%rsi), %xmm3 + pshuflw $0xb1, %xmm0, %xmm0 + pshuflw 
$0xb1, %xmm1, %xmm1 + pshuflw $0xb1, %xmm2, %xmm2 + pshuflw $0xb1, %xmm3, %xmm3 + pshufhw $0xb1, %xmm0, %xmm0 + pshufhw $0xb1, %xmm1, %xmm1 + pshufhw $0xb1, %xmm2, %xmm2 + pshufhw $0xb1, %xmm3, %xmm3 + movdqa %xmm0, %xmm4 + movdqa %xmm1, %xmm5 + movdqa %xmm2, %xmm6 + movdqa %xmm3, %xmm7 + psrlw $8, %xmm4 + psrlw $8, %xmm5 + psrlw $8, %xmm6 + psrlw $8, %xmm7 + psllw $8, %xmm0 + psllw $8, %xmm1 + psllw $8, %xmm2 + psllw $8, %xmm3 + pxor %xmm4, %xmm0 + pxor %xmm5, %xmm1 + pxor %xmm6, %xmm2 + pxor %xmm7, %xmm3 + +sha256_transform_sse2_core: + leaq sha256_k(%rip), %rdx + movq $48, %rsi + .p2align 4 +sha256_transform_sse2_loop: + movdqa 0*16(%rdx), %xmm9 + paddd %xmm0, %xmm9 + movdqa %xmm9, (%rsp) + sha256_mixed_quadround %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %xmm0, %xmm1, %xmm2, %xmm3 + movdqa 1*16(%rdx), %xmm9 + paddd %xmm1, %xmm9 + movdqa %xmm9, (%rsp) + sha256_mixed_quadround %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %xmm1, %xmm2, %xmm3, %xmm0 + movdqa 2*16(%rdx), %xmm9 + paddd %xmm2, %xmm9 + movdqa %xmm9, (%rsp) + sha256_mixed_quadround %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %xmm2, %xmm3, %xmm0, %xmm1 + movdqa 3*16(%rdx), %xmm9 + paddd %xmm3, %xmm9 + movdqa %xmm9, (%rsp) + addq $4*16, %rdx + sha256_mixed_quadround %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %xmm3, %xmm0, %xmm1, %xmm2 + + subq $16, %rsi + jne sha256_transform_sse2_loop + + paddd 0*16(%rdx), %xmm0 + movdqa %xmm0, (%rsp) + sha256_main_round 0, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d + sha256_main_round 1, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d + sha256_main_round 2, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d + sha256_main_round 3, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d + paddd 1*16(%rdx), %xmm1 + movdqa %xmm1, (%rsp) + sha256_main_round 0, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d + sha256_main_round 1, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d + sha256_main_round 2, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d + sha256_main_round 3, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d + paddd 2*16(%rdx), %xmm2 + movdqa %xmm2, (%rsp) + sha256_main_round 0, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d + sha256_main_round 1, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d + sha256_main_round 2, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d + sha256_main_round 3, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d + paddd 3*16(%rdx), %xmm3 + movdqa %xmm3, (%rsp) + sha256_main_round 0, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d + sha256_main_round 1, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d + sha256_main_round 2, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d + sha256_main_round 3, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d + + addl %r8d, 0*4(%rdi) + addl %r9d, 1*4(%rdi) + addl %r10d, 2*4(%rdi) + addl %r11d, 3*4(%rdi) + addl %r12d, 4*4(%rdi) + addl %r13d, 5*4(%rdi) + addl %r14d, 6*4(%rdi) + addl %r15d, 7*4(%rdi) + +#if defined(_WIN64) || defined(__CYGWIN__) + movdqa 1*16(%rsp), %xmm6 + movdqa 2*16(%rsp), %xmm7 + movdqa 3*16(%rsp), %xmm8 + movdqa 4*16(%rsp), %xmm9 + addq $5*16, %rsp + popq %rsi + popq %rdi +#else + addq $16, %rsp +#endif + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + ret + + + .text + .p2align 6 +sha256_transform_phe: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + movq %rsp, %r8 + subq $64, %rsp + andq $-64, %rsp + + testq 
%rdx, %rdx + jnz sha256_transform_phe_noswap + + movl 0*4(%rsi), %eax + movl 1*4(%rsi), %ecx + movl 2*4(%rsi), %edx + movl 3*4(%rsi), %r9d + bswapl %eax + bswapl %ecx + bswapl %edx + bswapl %r9d + movl %eax, 0*4(%rsp) + movl %ecx, 1*4(%rsp) + movl %edx, 2*4(%rsp) + movl %r9d, 3*4(%rsp) + movl 4*4(%rsi), %eax + movl 5*4(%rsi), %ecx + movl 6*4(%rsi), %edx + movl 7*4(%rsi), %r9d + bswapl %eax + bswapl %ecx + bswapl %edx + bswapl %r9d + movl %eax, 4*4(%rsp) + movl %ecx, 5*4(%rsp) + movl %edx, 6*4(%rsp) + movl %r9d, 7*4(%rsp) + + movdqu 2*16(%rsi), %xmm0 + movdqu 3*16(%rsi), %xmm2 + pshuflw $0xb1, %xmm0, %xmm0 + pshuflw $0xb1, %xmm2, %xmm2 + pshufhw $0xb1, %xmm0, %xmm0 + pshufhw $0xb1, %xmm2, %xmm2 + movdqa %xmm0, %xmm1 + movdqa %xmm2, %xmm3 + psrlw $8, %xmm1 + psrlw $8, %xmm3 + psllw $8, %xmm0 + psllw $8, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm3, %xmm2 + movdqa %xmm0, 2*16(%rsp) + movdqa %xmm2, 3*16(%rsp) + + jmp sha256_transform_phe_core + +sha256_transform_phe_noswap: + movdqu 0*16(%rsi), %xmm0 + movdqu 1*16(%rsi), %xmm1 + movdqu 2*16(%rsi), %xmm2 + movdqu 3*16(%rsi), %xmm3 + movdqa %xmm0, 0*16(%rsp) + movdqa %xmm1, 1*16(%rsp) + movdqa %xmm2, 2*16(%rsp) + movdqa %xmm3, 3*16(%rsp) + +sha256_transform_phe_core: + movq %rsp, %rsi + movq $-1, %rax + movq $1, %rcx + /* rep xsha256 */ + .byte 0xf3, 0x0f, 0xa6, 0xd0 + + movq %r8, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + popq %rdi +#endif + ret + + + .data + .p2align 3 +sha256_transform_addr: + .quad sha256_transform_sse2 + + .text + .p2align 3 + .globl sha256_transform + .globl _sha256_transform +sha256_transform: +_sha256_transform: + jmp *sha256_transform_addr(%rip) + + + .text + .p2align 6 + .globl sha256d_ms + .globl _sha256d_ms +sha256d_ms: +_sha256d_ms: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + movq %rsp, %r8 + subq $32, %rsp + andq $-32, %rsp + + movdqa 0*16(%rdx), %xmm0 + movdqa 1*16(%rdx), %xmm1 + movdqa %xmm0, 0*16(%rdi) + movdqa %xmm1, 1*16(%rdi) + + movl 0*4(%rsi), %eax + movl 1*4(%rsi), %ecx + movl 2*4(%rsi), %edx + movl 3*4(%rsi), %r9d + bswapl %eax + bswapl %ecx + bswapl %edx + bswapl %r9d + movl %eax, 0*4(%rsp) + movl %ecx, 1*4(%rsp) + movl %edx, 2*4(%rsp) + movl %r9d, 3*4(%rsp) + + movq %rsp, %rsi + movl $64, %eax + movl $80, %ecx + /* rep xsha256 */ + .byte 0xf3, 0x0f, 0xa6, 0xd0 + + movdqa bswap_xmm_mask(%rip), %xmm1 + movdqa 0*16(%rdi), %xmm0 + movdqa 1*16(%rdi), %xmm2 + pshufb %xmm1, %xmm0 + pshufb %xmm1, %xmm2 + movdqa %xmm0, 0*16(%rsp) + movdqa %xmm2, 1*16(%rsp) + + movdqa sha256_h+0*16(%rip), %xmm0 + movdqa sha256_h+1*16(%rip), %xmm1 + movdqa %xmm0, 0*16(%rdi) + movdqa %xmm1, 1*16(%rdi) + + movq %rsp, %rsi + xorq %rax, %rax + movl $32, %ecx + /* rep xsha256 */ + .byte 0xf3, 0x0f, 0xa6, 0xd0 + + movq %r8, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + popq %rdi +#endif + ret + + + .data + .p2align 7 +sha256_4h: + .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 + .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 + .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372 + .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a + .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f + .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c + .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab + .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 + + .data + .p2align 7 +sha256_4k: + .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98 + .long 0x71374491, 0x71374491, 0x71374491, 0x71374491 + .long 
0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf + .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5 + .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b + .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1 + .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4 + .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5 + .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98 + .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01 + .long 0x243185be, 0x243185be, 0x243185be, 0x243185be + .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3 + .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74 + .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe + .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7 + .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174 + .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1 + .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786 + .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6 + .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc + .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f + .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa + .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc + .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da + .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152 + .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d + .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8 + .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7 + .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3 + .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147 + .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351 + .long 0x14292967, 0x14292967, 0x14292967, 0x14292967 + .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85 + .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138 + .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc + .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13 + .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354 + .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb + .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e + .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85 + .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1 + .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b + .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70 + .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3 + .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819 + .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624 + .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585 + .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070 + .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116 + .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08 + .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c + .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5 + .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3 + .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a + .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f + .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3 + .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee + .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f + .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814 + .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208 + .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa + .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb + .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7 + .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2 + + .data + .p2align 6 +sha256d_4preext2_17: + .long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000 +sha256d_4preext2_23: + .long 0x11002000, 
0x11002000, 0x11002000, 0x11002000 +sha256d_4preext2_24: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +sha256d_4preext2_30: + .long 0x00400022, 0x00400022, 0x00400022, 0x00400022 + + +#ifdef USE_AVX2 + + .data + .p2align 7 +sha256_8h: + .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 + .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 + .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372 + .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a + .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f + .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c + .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab + .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 + + .data + .p2align 7 +sha256_8k: + .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98 + .long 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491 + .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf + .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5 + .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b + .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1 + .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4 + .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5 + .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98 + .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01 + .long 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be + .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3 + .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74 + .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe + .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7 + .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174 + .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1 + .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786 + .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6 + .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc + .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f + .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa + .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc + .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 
0x76f988da, 0x76f988da + .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152 + .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d + .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8 + .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7 + .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3 + .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147 + .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351 + .long 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967 + .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85 + .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138 + .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc + .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13 + .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354 + .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb + .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e + .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85 + .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1 + .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b + .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70 + .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3 + .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819 + .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624 + .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585 + .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070 + .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116 + .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08 + .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c + .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5 + .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3 + .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a + .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f + .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3 + .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee + .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f + .long 0x84c87814, 
0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814 + .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208 + .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa + .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb + .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7 + .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2 + + .data + .p2align 6 +sha256d_8preext2_17: + .long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000 +sha256d_8preext2_23: + .long 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000 +sha256d_8preext2_24: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 +sha256d_8preext2_30: + .long 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022 + +#endif /* USE_AVX2 */ + + + .text + .p2align 6 + .globl sha256_init_4way + .globl _sha256_init_4way +sha256_init_4way: +_sha256_init_4way: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + movq %rcx, %rdi +#endif + movdqa sha256_4h+0(%rip), %xmm0 + movdqa sha256_4h+16(%rip), %xmm1 + movdqa sha256_4h+32(%rip), %xmm2 + movdqa sha256_4h+48(%rip), %xmm3 + movdqu %xmm0, 0(%rdi) + movdqu %xmm1, 16(%rdi) + movdqu %xmm2, 32(%rdi) + movdqu %xmm3, 48(%rdi) + movdqa sha256_4h+64(%rip), %xmm0 + movdqa sha256_4h+80(%rip), %xmm1 + movdqa sha256_4h+96(%rip), %xmm2 + movdqa sha256_4h+112(%rip), %xmm3 + movdqu %xmm0, 64(%rdi) + movdqu %xmm1, 80(%rdi) + movdqu %xmm2, 96(%rdi) + movdqu %xmm3, 112(%rdi) +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rdi +#endif + ret + + +#ifdef USE_AVX2 + .text + .p2align 6 + .globl sha256_init_8way + .globl _sha256_init_8way +sha256_init_8way: +_sha256_init_8way: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + movq %rcx, %rdi +#endif + vpbroadcastd sha256_4h+0(%rip), %ymm0 + vpbroadcastd sha256_4h+16(%rip), %ymm1 + vpbroadcastd sha256_4h+32(%rip), %ymm2 + vpbroadcastd sha256_4h+48(%rip), %ymm3 + vmovdqu %ymm0, 0*32(%rdi) + vmovdqu %ymm1, 1*32(%rdi) + vmovdqu %ymm2, 2*32(%rdi) + vmovdqu %ymm3, 3*32(%rdi) + vpbroadcastd sha256_4h+64(%rip), %ymm0 + vpbroadcastd sha256_4h+80(%rip), %ymm1 + vpbroadcastd sha256_4h+96(%rip), %ymm2 + vpbroadcastd sha256_4h+112(%rip), %ymm3 + vmovdqu %ymm0, 4*32(%rdi) + vmovdqu %ymm1, 5*32(%rdi) + vmovdqu %ymm2, 6*32(%rdi) + vmovdqu %ymm3, 7*32(%rdi) +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rdi +#endif + ret +#endif /* USE_AVX2 */ + + +.macro sha256_sse2_extend_round i + movdqa (\i-15)*16(%rax), %xmm0 + movdqa %xmm0, %xmm2 + psrld $3, %xmm0 + movdqa %xmm0, %xmm1 + pslld $14, %xmm2 + psrld $4, %xmm1 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + psrld $11, %xmm1 + pslld $11, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + paddd (\i-16)*16(%rax), %xmm0 + paddd (\i-7)*16(%rax), %xmm0 + + movdqa %xmm3, %xmm2 + psrld $10, %xmm3 + pslld $13, %xmm2 + movdqa %xmm3, %xmm1 + psrld $7, %xmm1 + pxor %xmm1, %xmm3 + pxor %xmm2, %xmm3 + psrld $2, %xmm1 + pslld $2, %xmm2 + pxor %xmm1, %xmm3 + pxor %xmm2, %xmm3 + paddd %xmm0, %xmm3 + movdqa %xmm3, \i*16(%rax) +.endm + +.macro sha256_sse2_extend_doubleround i + movdqa (\i-15)*16(%rax), %xmm0 + movdqa (\i-14)*16(%rax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 
+ psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (\i-16)*16(%rax), %xmm0 + paddd (\i-15)*16(%rax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (\i-7)*16(%rax), %xmm0 + paddd (\i-6)*16(%rax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, \i*16(%rax) + movdqa %xmm7, (\i+1)*16(%rax) +.endm + +.macro sha256_sse2_main_round i + movdqa 16*(\i)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(\i)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 +.endm + +.macro sha256_sse2_main_quadround i + sha256_sse2_main_round \i+0 + sha256_sse2_main_round \i+1 + sha256_sse2_main_round \i+2 + sha256_sse2_main_round \i+3 +.endm + + +#if defined(USE_AVX) + +.macro sha256_avx_extend_round i + vmovdqa (\i-15)*16(%rax), %xmm0 + vpslld $14, %xmm0, %xmm2 + vpsrld $3, %xmm0, %xmm0 + vpsrld $4, %xmm0, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpsrld $11, %xmm1, %xmm1 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd (\i-16)*16(%rax), %xmm0, %xmm0 + vpaddd (\i-7)*16(%rax), %xmm0, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpsrld $10, %xmm3, %xmm3 + vpsrld $7, %xmm3, %xmm1 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpsrld $2, %xmm1, %xmm1 + vpslld $2, %xmm2, %xmm2 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm0, %xmm3, %xmm3 + vmovdqa %xmm3, \i*16(%rax) +.endm + +.macro sha256_avx_extend_doubleround i + vmovdqa (\i-15)*16(%rax), %xmm0 + vmovdqa (\i-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor 
%xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (\i-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (\i-7)*16(%rax), %xmm0, %xmm0 + vpaddd (\i-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, \i*16(%rax) + vmovdqa %xmm7, (\i+1)*16(%rax) +.endm + +.macro sha256_avx_main_round i, r0, r1, r2, r3, r4, r5, r6, r7 + vpaddd 16*(\i)(%rax), \r0, %xmm6 + vpaddd 16*(\i)(%rcx), %xmm6, %xmm6 + + vpandn \r1, \r3, %xmm1 + vpand \r3, \r2, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, \r3, %xmm1 + vpsrld $6, \r3, \r0 + vpsrld $5, \r0, %xmm2 + vpxor %xmm1, \r0, \r0 + vpxor %xmm2, \r0, \r0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, \r0, \r0 + vpxor %xmm2, \r0, \r0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, \r0, \r0 + vpaddd \r0, %xmm6, %xmm6 + vpaddd %xmm6, \r4, \r0 + + vpand \r6, \r5, %xmm2 + vpand \r7, \r5, \r4 + vpand \r7, \r6, %xmm1 + vpxor \r4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, \r7, %xmm2 + vpsrld $2, \r7, \r4 + vpsrld $11, \r4, %xmm1 + vpxor %xmm2, \r4, \r4 + vpxor %xmm1, \r4, \r4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, \r4, \r4 + vpxor %xmm1, \r4, \r4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, \r4, \r4 + vpaddd %xmm6, \r4, \r4 +.endm + +.macro sha256_avx_main_quadround i + sha256_avx_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 + sha256_avx_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 + sha256_avx_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 + sha256_avx_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 +.endm + +#endif /* USE_AVX */ + + +#if defined(USE_AVX2) + +.macro sha256_avx2_extend_round i + vmovdqa (\i-15)*32(%rax), %ymm0 + vpslld $14, %ymm0, %ymm2 + vpsrld $3, %ymm0, %ymm0 + vpsrld $4, %ymm0, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpsrld $11, %ymm1, %ymm1 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpaddd (\i-16)*32(%rax), %ymm0, %ymm0 + vpaddd (\i-7)*32(%rax), %ymm0, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpsrld $10, %ymm3, %ymm3 + vpsrld $7, %ymm3, %ymm1 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm2, %ymm3, %ymm3 + vpsrld $2, %ymm1, %ymm1 + vpslld $2, %ymm2, %ymm2 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm0, %ymm3, %ymm3 + vmovdqa %ymm3, \i*32(%rax) +.endm + +.macro sha256_avx2_extend_doubleround i + vmovdqa (\i-15)*32(%rax), %ymm0 + vmovdqa (\i-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor 
%ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (\i-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (\i-7)*32(%rax), %ymm0, %ymm0 + vpaddd (\i-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, \i*32(%rax) + vmovdqa %ymm7, (\i+1)*32(%rax) +.endm + +.macro sha256_avx2_main_round i, r0, r1, r2, r3, r4, r5, r6, r7 + vpaddd 32*(\i)(%rax), \r0, %ymm6 + vpaddd 32*(\i)(%rcx), %ymm6, %ymm6 + + vpandn \r1, \r3, %ymm1 + vpand \r3, \r2, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, \r3, %ymm1 + vpsrld $6, \r3, \r0 + vpsrld $5, \r0, %ymm2 + vpxor %ymm1, \r0, \r0 + vpxor %ymm2, \r0, \r0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, \r0, \r0 + vpxor %ymm2, \r0, \r0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, \r0, \r0 + vpaddd \r0, %ymm6, %ymm6 + vpaddd %ymm6, \r4, \r0 + + vpand \r6, \r5, %ymm2 + vpand \r7, \r5, \r4 + vpand \r7, \r6, %ymm1 + vpxor \r4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, \r7, %ymm2 + vpsrld $2, \r7, \r4 + vpsrld $11, \r4, %ymm1 + vpxor %ymm2, \r4, \r4 + vpxor %ymm1, \r4, \r4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, \r4, \r4 + vpxor %ymm1, \r4, \r4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, \r4, \r4 + vpaddd %ymm6, \r4, \r4 +.endm + +.macro sha256_avx2_main_quadround i + sha256_avx2_main_round \i+0, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7 + sha256_avx2_main_round \i+1, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3 + sha256_avx2_main_round \i+2, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4 + sha256_avx2_main_round \i+3, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5 +.endm + +#endif /* USE_AVX2 */ + + +#if defined(USE_XOP) + +.macro sha256_xop_extend_round i + vmovdqa (\i-15)*16(%rax), %xmm0 + vprotd $25, %xmm0, %xmm1 + vprotd $14, %xmm0, %xmm2 + vpsrld $3, %xmm0, %xmm0 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm2, %xmm0, %xmm0 + + vpaddd (\i-16)*16(%rax), %xmm0, %xmm0 + vpaddd (\i-7)*16(%rax), %xmm0, %xmm0 + + vprotd $15, %xmm3, %xmm1 + vprotd $13, %xmm3, %xmm2 + vpsrld $10, %xmm3, %xmm3 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm0, %xmm3, %xmm3 + vmovdqa %xmm3, \i*16(%rax) +.endm + +.macro sha256_xop_extend_doubleround i + vmovdqa (\i-15)*16(%rax), %xmm0 + vmovdqa (\i-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (\i-16)*16(%rax), %xmm0, %xmm0 + vpaddd (\i-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 
+ + vpaddd (\i-7)*16(%rax), %xmm0, %xmm0 + vpaddd (\i-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, \i*16(%rax) + vmovdqa %xmm7, (\i+1)*16(%rax) +.endm + +.macro sha256_xop_main_round i, r0, r1, r2, r3, r4, r5, r6, r7 + vpaddd 16*(\i)(%rax), \r0, %xmm6 + vpaddd 16*(\i)(%rcx), %xmm6, %xmm6 + + vpandn \r1, \r3, %xmm1 + vpand \r3, \r2, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, \r3, %xmm1 + vprotd $21, \r3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, \r3, \r0 + vpxor %xmm2, \r0, \r0 + vpaddd \r0, %xmm6, %xmm6 + vpaddd %xmm6, \r4, \r0 + + vpand \r6, \r5, %xmm2 + vpand \r7, \r5, \r4 + vpand \r7, \r6, %xmm1 + vpxor \r4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, \r7, %xmm1 + vprotd $19, \r7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, \r7, \r4 + vpxor %xmm2, \r4, \r4 + vpaddd %xmm6, \r4, \r4 +.endm + +.macro sha256_xop_main_quadround i + sha256_xop_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 + sha256_xop_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 + sha256_xop_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 + sha256_xop_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 +.endm + +#endif /* USE_XOP */ + + + .text + .p2align 6 +sha256_transform_4way_core_sse2: + leaq 256(%rsp), %rcx + leaq 48*16(%rcx), %rax + movdqa -2*16(%rcx), %xmm3 + movdqa -1*16(%rcx), %xmm7 +sha256_transform_4way_sse2_extend_loop: + movdqa -15*16(%rcx), %xmm0 + movdqa -14*16(%rcx), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd -16*16(%rcx), %xmm0 + paddd -15*16(%rcx), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd -7*16(%rcx), %xmm0 + paddd -6*16(%rcx), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, (%rcx) + movdqa %xmm7, 16(%rcx) + addq $2*16, %rcx + cmpq %rcx, %rax + jne sha256_transform_4way_sse2_extend_loop + + movdqu 0(%rdi), %xmm7 + movdqu 16(%rdi), %xmm5 + movdqu 32(%rdi), %xmm4 + movdqu 48(%rdi), %xmm3 + movdqu 64(%rdi), %xmm0 + movdqu 80(%rdi), %xmm8 + movdqu 96(%rdi), %xmm9 + movdqu 112(%rdi), %xmm10 + + leaq sha256_4k(%rip), %rcx + xorq %rax, %rax +sha256_transform_4way_sse2_main_loop: + movdqa (%rsp, %rax), %xmm6 + paddd (%rcx, %rax), %xmm6 + paddd %xmm10, %xmm6 + + movdqa %xmm0, %xmm1 + movdqa %xmm9, %xmm2 + pandn %xmm2, %xmm1 + + movdqa %xmm2, %xmm10 + movdqa %xmm8, %xmm2 + movdqa %xmm2, %xmm9 + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, %xmm8 + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + movdqa %xmm0, %xmm2 + 
pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $5, %xmm1 + pxor %xmm1, %xmm0 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + paddd %xmm6, %xmm0 + + movdqa %xmm5, %xmm1 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + pand %xmm5, %xmm2 + pand %xmm7, %xmm4 + pand %xmm7, %xmm1 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pxor %xmm1, %xmm7 + pslld $9, %xmm2 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pxor %xmm1, %xmm7 + pslld $11, %xmm2 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + + addq $16, %rax + cmpq $16*64, %rax + jne sha256_transform_4way_sse2_main_loop + jmp sha256_transform_4way_finish + + +#if defined(USE_AVX) + .text + .p2align 6 +sha256_transform_4way_core_avx: + leaq 256(%rsp), %rax + movdqa -2*16(%rax), %xmm3 + movdqa -1*16(%rax), %xmm7 + sha256_avx_extend_doubleround 0 + sha256_avx_extend_doubleround 2 + sha256_avx_extend_doubleround 4 + sha256_avx_extend_doubleround 6 + sha256_avx_extend_doubleround 8 + sha256_avx_extend_doubleround 10 + sha256_avx_extend_doubleround 12 + sha256_avx_extend_doubleround 14 + sha256_avx_extend_doubleround 16 + sha256_avx_extend_doubleround 18 + sha256_avx_extend_doubleround 20 + sha256_avx_extend_doubleround 22 + sha256_avx_extend_doubleround 24 + sha256_avx_extend_doubleround 26 + sha256_avx_extend_doubleround 28 + sha256_avx_extend_doubleround 30 + sha256_avx_extend_doubleround 32 + sha256_avx_extend_doubleround 34 + sha256_avx_extend_doubleround 36 + sha256_avx_extend_doubleround 38 + sha256_avx_extend_doubleround 40 + sha256_avx_extend_doubleround 42 + sha256_avx_extend_doubleround 44 + sha256_avx_extend_doubleround 46 + movdqu 0(%rdi), %xmm7 + movdqu 16(%rdi), %xmm5 + movdqu 32(%rdi), %xmm4 + movdqu 48(%rdi), %xmm3 + movdqu 64(%rdi), %xmm0 + movdqu 80(%rdi), %xmm8 + movdqu 96(%rdi), %xmm9 + movdqu 112(%rdi), %xmm10 + movq %rsp, %rax + leaq sha256_4k(%rip), %rcx + sha256_avx_main_quadround 0 + sha256_avx_main_quadround 4 + sha256_avx_main_quadround 8 + sha256_avx_main_quadround 12 + sha256_avx_main_quadround 16 + sha256_avx_main_quadround 20 + sha256_avx_main_quadround 24 + sha256_avx_main_quadround 28 + sha256_avx_main_quadround 32 + sha256_avx_main_quadround 36 + sha256_avx_main_quadround 40 + sha256_avx_main_quadround 44 + sha256_avx_main_quadround 48 + sha256_avx_main_quadround 52 + sha256_avx_main_quadround 56 + sha256_avx_main_quadround 60 + jmp sha256_transform_4way_finish +#endif /* USE_AVX */ + + +#if defined(USE_XOP) + .text + .p2align 6 +sha256_transform_4way_core_xop: + leaq 256(%rsp), %rax + movdqa -2*16(%rax), %xmm3 + movdqa -1*16(%rax), %xmm7 + sha256_xop_extend_doubleround 0 + sha256_xop_extend_doubleround 2 + sha256_xop_extend_doubleround 4 + sha256_xop_extend_doubleround 6 + sha256_xop_extend_doubleround 8 + sha256_xop_extend_doubleround 10 + sha256_xop_extend_doubleround 12 + sha256_xop_extend_doubleround 14 + sha256_xop_extend_doubleround 16 + sha256_xop_extend_doubleround 18 + sha256_xop_extend_doubleround 20 + sha256_xop_extend_doubleround 22 + sha256_xop_extend_doubleround 24 + sha256_xop_extend_doubleround 26 + sha256_xop_extend_doubleround 28 + sha256_xop_extend_doubleround 30 + sha256_xop_extend_doubleround 32 + sha256_xop_extend_doubleround 34 + sha256_xop_extend_doubleround 36 + sha256_xop_extend_doubleround 38 + 
sha256_xop_extend_doubleround 40 + sha256_xop_extend_doubleround 42 + sha256_xop_extend_doubleround 44 + sha256_xop_extend_doubleround 46 + movdqu 0(%rdi), %xmm7 + movdqu 16(%rdi), %xmm5 + movdqu 32(%rdi), %xmm4 + movdqu 48(%rdi), %xmm3 + movdqu 64(%rdi), %xmm0 + movdqu 80(%rdi), %xmm8 + movdqu 96(%rdi), %xmm9 + movdqu 112(%rdi), %xmm10 + movq %rsp, %rax + leaq sha256_4k(%rip), %rcx + sha256_xop_main_quadround 0 + sha256_xop_main_quadround 4 + sha256_xop_main_quadround 8 + sha256_xop_main_quadround 12 + sha256_xop_main_quadround 16 + sha256_xop_main_quadround 20 + sha256_xop_main_quadround 24 + sha256_xop_main_quadround 28 + sha256_xop_main_quadround 32 + sha256_xop_main_quadround 36 + sha256_xop_main_quadround 40 + sha256_xop_main_quadround 44 + sha256_xop_main_quadround 48 + sha256_xop_main_quadround 52 + sha256_xop_main_quadround 56 + sha256_xop_main_quadround 60 + jmp sha256_transform_4way_finish +#endif /* USE_XOP */ + + + .data + .p2align 3 +sha256_transform_4way_core_addr: + .quad 0x0 + +.macro p2bswap_rsi_rsp i + movdqu \i*16(%rsi), %xmm0 + movdqu (\i+1)*16(%rsi), %xmm2 + pshuflw $0xb1, %xmm0, %xmm0 + pshuflw $0xb1, %xmm2, %xmm2 + pshufhw $0xb1, %xmm0, %xmm0 + pshufhw $0xb1, %xmm2, %xmm2 + movdqa %xmm0, %xmm1 + movdqa %xmm2, %xmm3 + psrlw $8, %xmm1 + psrlw $8, %xmm3 + psllw $8, %xmm0 + psllw $8, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm3, %xmm2 + movdqa %xmm0, \i*16(%rsp) + movdqa %xmm2, (\i+1)*16(%rsp) +.endm + + .text + .p2align 6 + .globl sha256_transform_4way + .globl _sha256_transform_4way +sha256_transform_4way: +_sha256_transform_4way: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + subq $96, %rsp + movdqa %xmm6, 0(%rsp) + movdqa %xmm7, 16(%rsp) + movdqa %xmm8, 32(%rsp) + movdqa %xmm9, 48(%rsp) + movdqa %xmm10, 64(%rsp) + movdqa %xmm11, 80(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + movq %rsp, %r8 + subq $1032, %rsp + andq $-128, %rsp + + testq %rdx, %rdx + jnz sha256_transform_4way_swap + + movdqu 0*16(%rsi), %xmm0 + movdqu 1*16(%rsi), %xmm1 + movdqu 2*16(%rsi), %xmm2 + movdqu 3*16(%rsi), %xmm3 + movdqu 4*16(%rsi), %xmm4 + movdqu 5*16(%rsi), %xmm5 + movdqu 6*16(%rsi), %xmm6 + movdqu 7*16(%rsi), %xmm7 + movdqa %xmm0, 0*16(%rsp) + movdqa %xmm1, 1*16(%rsp) + movdqa %xmm2, 2*16(%rsp) + movdqa %xmm3, 3*16(%rsp) + movdqa %xmm4, 4*16(%rsp) + movdqa %xmm5, 5*16(%rsp) + movdqa %xmm6, 6*16(%rsp) + movdqa %xmm7, 7*16(%rsp) + movdqu 8*16(%rsi), %xmm0 + movdqu 9*16(%rsi), %xmm1 + movdqu 10*16(%rsi), %xmm2 + movdqu 11*16(%rsi), %xmm3 + movdqu 12*16(%rsi), %xmm4 + movdqu 13*16(%rsi), %xmm5 + movdqu 14*16(%rsi), %xmm6 + movdqu 15*16(%rsi), %xmm7 + movdqa %xmm0, 8*16(%rsp) + movdqa %xmm1, 9*16(%rsp) + movdqa %xmm2, 10*16(%rsp) + movdqa %xmm3, 11*16(%rsp) + movdqa %xmm4, 12*16(%rsp) + movdqa %xmm5, 13*16(%rsp) + movdqa %xmm6, 14*16(%rsp) + movdqa %xmm7, 15*16(%rsp) + jmp *sha256_transform_4way_core_addr(%rip) + + .p2align 6 +sha256_transform_4way_swap: + p2bswap_rsi_rsp 0 + p2bswap_rsi_rsp 2 + p2bswap_rsi_rsp 4 + p2bswap_rsi_rsp 6 + p2bswap_rsi_rsp 8 + p2bswap_rsi_rsp 10 + p2bswap_rsi_rsp 12 + p2bswap_rsi_rsp 14 + jmp *sha256_transform_4way_core_addr(%rip) + + .p2align 6 +sha256_transform_4way_finish: + movdqu 0(%rdi), %xmm2 + movdqu 16(%rdi), %xmm6 + movdqu 32(%rdi), %xmm11 + movdqu 48(%rdi), %xmm1 + paddd %xmm2, %xmm7 + paddd %xmm6, %xmm5 + paddd %xmm11, %xmm4 + paddd %xmm1, %xmm3 + movdqu 64(%rdi), %xmm2 + movdqu 80(%rdi), %xmm6 + movdqu 96(%rdi), %xmm11 + movdqu 112(%rdi), %xmm1 + paddd %xmm2, %xmm0 + paddd %xmm6, %xmm8 + paddd %xmm11, %xmm9 + 
paddd %xmm1, %xmm10 + + movdqu %xmm7, 0(%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm4, 32(%rdi) + movdqu %xmm3, 48(%rdi) + movdqu %xmm0, 64(%rdi) + movdqu %xmm8, 80(%rdi) + movdqu %xmm9, 96(%rdi) + movdqu %xmm10, 112(%rdi) + + movq %r8, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + movdqa 0(%rsp), %xmm6 + movdqa 16(%rsp), %xmm7 + movdqa 32(%rsp), %xmm8 + movdqa 48(%rsp), %xmm9 + movdqa 64(%rsp), %xmm10 + movdqa 80(%rsp), %xmm11 + addq $96, %rsp + popq %rdi +#endif + ret + + +#ifdef USE_AVX2 + + .text + .p2align 6 +sha256_transform_8way_core_avx2: + leaq 8*64(%rsp), %rax + vmovdqa -2*32(%rax), %ymm3 + vmovdqa -1*32(%rax), %ymm7 + sha256_avx2_extend_doubleround 0 + sha256_avx2_extend_doubleround 2 + sha256_avx2_extend_doubleround 4 + sha256_avx2_extend_doubleround 6 + sha256_avx2_extend_doubleround 8 + sha256_avx2_extend_doubleround 10 + sha256_avx2_extend_doubleround 12 + sha256_avx2_extend_doubleround 14 + sha256_avx2_extend_doubleround 16 + sha256_avx2_extend_doubleround 18 + sha256_avx2_extend_doubleround 20 + sha256_avx2_extend_doubleround 22 + sha256_avx2_extend_doubleround 24 + sha256_avx2_extend_doubleround 26 + sha256_avx2_extend_doubleround 28 + sha256_avx2_extend_doubleround 30 + sha256_avx2_extend_doubleround 32 + sha256_avx2_extend_doubleround 34 + sha256_avx2_extend_doubleround 36 + sha256_avx2_extend_doubleround 38 + sha256_avx2_extend_doubleround 40 + sha256_avx2_extend_doubleround 42 + sha256_avx2_extend_doubleround 44 + sha256_avx2_extend_doubleround 46 + vmovdqu 0*32(%rdi), %ymm7 + vmovdqu 1*32(%rdi), %ymm5 + vmovdqu 2*32(%rdi), %ymm4 + vmovdqu 3*32(%rdi), %ymm3 + vmovdqu 4*32(%rdi), %ymm0 + vmovdqu 5*32(%rdi), %ymm8 + vmovdqu 6*32(%rdi), %ymm9 + vmovdqu 7*32(%rdi), %ymm10 + movq %rsp, %rax + leaq sha256_8k(%rip), %rcx + sha256_avx2_main_quadround 0 + sha256_avx2_main_quadround 4 + sha256_avx2_main_quadround 8 + sha256_avx2_main_quadround 12 + sha256_avx2_main_quadround 16 + sha256_avx2_main_quadround 20 + sha256_avx2_main_quadround 24 + sha256_avx2_main_quadround 28 + sha256_avx2_main_quadround 32 + sha256_avx2_main_quadround 36 + sha256_avx2_main_quadround 40 + sha256_avx2_main_quadround 44 + sha256_avx2_main_quadround 48 + sha256_avx2_main_quadround 52 + sha256_avx2_main_quadround 56 + sha256_avx2_main_quadround 60 + jmp sha256_transform_8way_finish + +.macro p2bswap_avx2_rsi_rsp i + vmovdqu \i*32(%rsi), %ymm0 + vmovdqu (\i+1)*32(%rsi), %ymm2 + vpshuflw $0xb1, %ymm0, %ymm0 + vpshuflw $0xb1, %ymm2, %ymm2 + vpshufhw $0xb1, %ymm0, %ymm0 + vpshufhw $0xb1, %ymm2, %ymm2 + vpsrlw $8, %ymm0, %ymm1 + vpsrlw $8, %ymm2, %ymm3 + vpsllw $8, %ymm0, %ymm0 + vpsllw $8, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm3, %ymm2, %ymm2 + vmovdqa %ymm0, \i*32(%rsp) + vmovdqa %ymm2, (\i+1)*32(%rsp) +.endm + + .text + .p2align 6 + .globl sha256_transform_8way + .globl _sha256_transform_8way +sha256_transform_8way: +_sha256_transform_8way: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + subq $96, %rsp + vmovdqa %xmm6, 0(%rsp) + vmovdqa %xmm7, 16(%rsp) + vmovdqa %xmm8, 32(%rsp) + vmovdqa %xmm9, 48(%rsp) + vmovdqa %xmm10, 64(%rsp) + vmovdqa %xmm11, 80(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + movq %rsp, %r8 + subq $64*32, %rsp + andq $-128, %rsp + + testq %rdx, %rdx + jnz sha256_transform_8way_swap + + vmovdqu 0*32(%rsi), %ymm0 + vmovdqu 1*32(%rsi), %ymm1 + vmovdqu 2*32(%rsi), %ymm2 + vmovdqu 3*32(%rsi), %ymm3 + vmovdqu 4*32(%rsi), %ymm4 + vmovdqu 5*32(%rsi), %ymm5 + vmovdqu 6*32(%rsi), %ymm6 + vmovdqu 7*32(%rsi), %ymm7 + 
vmovdqa %ymm0, 0*32(%rsp) + vmovdqa %ymm1, 1*32(%rsp) + vmovdqa %ymm2, 2*32(%rsp) + vmovdqa %ymm3, 3*32(%rsp) + vmovdqa %ymm4, 4*32(%rsp) + vmovdqa %ymm5, 5*32(%rsp) + vmovdqa %ymm6, 6*32(%rsp) + vmovdqa %ymm7, 7*32(%rsp) + vmovdqu 8*32(%rsi), %ymm0 + vmovdqu 9*32(%rsi), %ymm1 + vmovdqu 10*32(%rsi), %ymm2 + vmovdqu 11*32(%rsi), %ymm3 + vmovdqu 12*32(%rsi), %ymm4 + vmovdqu 13*32(%rsi), %ymm5 + vmovdqu 14*32(%rsi), %ymm6 + vmovdqu 15*32(%rsi), %ymm7 + vmovdqa %ymm0, 8*32(%rsp) + vmovdqa %ymm1, 9*32(%rsp) + vmovdqa %ymm2, 10*32(%rsp) + vmovdqa %ymm3, 11*32(%rsp) + vmovdqa %ymm4, 12*32(%rsp) + vmovdqa %ymm5, 13*32(%rsp) + vmovdqa %ymm6, 14*32(%rsp) + vmovdqa %ymm7, 15*32(%rsp) + jmp sha256_transform_8way_core_avx2 + + .p2align 6 +sha256_transform_8way_swap: + p2bswap_avx2_rsi_rsp 0 + p2bswap_avx2_rsi_rsp 2 + p2bswap_avx2_rsi_rsp 4 + p2bswap_avx2_rsi_rsp 6 + p2bswap_avx2_rsi_rsp 8 + p2bswap_avx2_rsi_rsp 10 + p2bswap_avx2_rsi_rsp 12 + p2bswap_avx2_rsi_rsp 14 + jmp sha256_transform_8way_core_avx2 + + .p2align 6 +sha256_transform_8way_finish: + vmovdqu 0*32(%rdi), %ymm2 + vmovdqu 1*32(%rdi), %ymm6 + vmovdqu 2*32(%rdi), %ymm11 + vmovdqu 3*32(%rdi), %ymm1 + vpaddd %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd %ymm11, %ymm4, %ymm4 + vpaddd %ymm1, %ymm3, %ymm3 + vmovdqu 4*32(%rdi), %ymm2 + vmovdqu 5*32(%rdi), %ymm6 + vmovdqu 6*32(%rdi), %ymm11 + vmovdqu 7*32(%rdi), %ymm1 + vpaddd %ymm2, %ymm0, %ymm0 + vpaddd %ymm6, %ymm8, %ymm8 + vpaddd %ymm11, %ymm9, %ymm9 + vpaddd %ymm1, %ymm10, %ymm10 + + vmovdqu %ymm7, 0*32(%rdi) + vmovdqu %ymm5, 1*32(%rdi) + vmovdqu %ymm4, 2*32(%rdi) + vmovdqu %ymm3, 3*32(%rdi) + vmovdqu %ymm0, 4*32(%rdi) + vmovdqu %ymm8, 5*32(%rdi) + vmovdqu %ymm9, 6*32(%rdi) + vmovdqu %ymm10, 7*32(%rdi) + + movq %r8, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + vmovdqa 0(%rsp), %xmm6 + vmovdqa 16(%rsp), %xmm7 + vmovdqa 32(%rsp), %xmm8 + vmovdqa 48(%rsp), %xmm9 + vmovdqa 64(%rsp), %xmm10 + vmovdqa 80(%rsp), %xmm11 + addq $96, %rsp + popq %rdi +#endif + ret + +#endif /* USE_AVX2 */ + + + .data + .p2align 3 +sha256d_ms_4way_addr: + .quad 0x0 + + .text + .p2align 6 + .globl sha256d_ms_4way + .globl _sha256d_ms_4way +sha256d_ms_4way: +_sha256d_ms_4way: + jmp *sha256d_ms_4way_addr(%rip) + + + .p2align 6 +sha256d_ms_4way_sse2: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + subq $32, %rsp + movdqa %xmm6, 0(%rsp) + movdqa %xmm7, 16(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + subq $8+67*16, %rsp + + leaq 256(%rsi), %rax + +sha256d_ms_4way_sse2_extend_loop1: + movdqa 3*16(%rsi), %xmm0 + movdqa 2*16(%rax), %xmm3 + movdqa 3*16(%rax), %xmm7 + movdqa %xmm3, 5*16(%rsp) + movdqa %xmm7, 6*16(%rsp) + movdqa %xmm0, %xmm2 + paddd %xmm0, %xmm7 + psrld $3, %xmm0 + movdqa %xmm0, %xmm1 + pslld $14, %xmm2 + psrld $4, %xmm1 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + psrld $11, %xmm1 + pslld $11, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + paddd %xmm0, %xmm3 + movdqa %xmm3, 2*16(%rax) + movdqa %xmm7, 3*16(%rax) + + movdqa 4*16(%rax), %xmm0 + movdqa %xmm0, 7*16(%rsp) + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + movdqa 
%xmm3, 4*16(%rax) + movdqa %xmm7, 5*16(%rax) + + movdqa 6*16(%rax), %xmm0 + movdqa 7*16(%rax), %xmm4 + movdqa %xmm0, 9*16(%rsp) + movdqa %xmm4, 10*16(%rsp) + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 6*16(%rax) + movdqa %xmm7, 7*16(%rax) + + movdqa 8*16(%rax), %xmm0 + movdqa 2*16(%rax), %xmm4 + movdqa %xmm0, 11*16(%rsp) + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 8*16(%rax) + movdqa %xmm7, 9*16(%rax) + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd 3*16(%rax), %xmm3 + paddd 4*16(%rax), %xmm7 + movdqa %xmm3, 10*16(%rax) + movdqa %xmm7, 11*16(%rax) + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd 5*16(%rax), %xmm3 + paddd 6*16(%rax), %xmm7 + movdqa %xmm3, 12*16(%rax) + movdqa %xmm7, 13*16(%rax) + + movdqa 14*16(%rax), %xmm0 + movdqa 15*16(%rax), %xmm4 + movdqa %xmm0, 17*16(%rsp) + movdqa %xmm4, 18*16(%rsp) + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + paddd 7*16(%rax), %xmm0 + paddd 8*16(%rax), %xmm4 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 14*16(%rax) + movdqa %xmm7, 15*16(%rax) + +sha256d_ms_4way_sse2_extend_loop2: + sha256_sse2_extend_doubleround 16 + sha256_sse2_extend_doubleround 18 + sha256_sse2_extend_doubleround 20 + sha256_sse2_extend_doubleround 22 + sha256_sse2_extend_doubleround 24 + sha256_sse2_extend_doubleround 26 + sha256_sse2_extend_doubleround 28 + sha256_sse2_extend_doubleround 30 + sha256_sse2_extend_doubleround 32 + sha256_sse2_extend_doubleround 34 + sha256_sse2_extend_doubleround 36 + sha256_sse2_extend_doubleround 38 + 
sha256_sse2_extend_doubleround 40 + sha256_sse2_extend_doubleround 42 + jz sha256d_ms_4way_sse2_extend_coda2 + sha256_sse2_extend_doubleround 44 + sha256_sse2_extend_doubleround 46 + + movdqa 0(%rcx), %xmm3 + movdqa 16(%rcx), %xmm0 + movdqa 32(%rcx), %xmm1 + movdqa 48(%rcx), %xmm2 + movdqa 64(%rcx), %xmm6 + movdqa 80(%rcx), %xmm7 + movdqa 96(%rcx), %xmm5 + movdqa 112(%rcx), %xmm4 + movdqa %xmm1, 0(%rsp) + movdqa %xmm2, 16(%rsp) + movdqa %xmm6, 32(%rsp) + + movq %rsi, %rax + leaq sha256_4k(%rip), %rcx + jmp sha256d_ms_4way_sse2_main_loop1 + +sha256d_ms_4way_sse2_main_loop2: + sha256_sse2_main_round 0 + sha256_sse2_main_round 1 + sha256_sse2_main_round 2 +sha256d_ms_4way_sse2_main_loop1: + sha256_sse2_main_round 3 + sha256_sse2_main_quadround 4 + sha256_sse2_main_quadround 8 + sha256_sse2_main_quadround 12 + sha256_sse2_main_quadround 16 + sha256_sse2_main_quadround 20 + sha256_sse2_main_quadround 24 + sha256_sse2_main_quadround 28 + sha256_sse2_main_quadround 32 + sha256_sse2_main_quadround 36 + sha256_sse2_main_quadround 40 + sha256_sse2_main_quadround 44 + sha256_sse2_main_quadround 48 + sha256_sse2_main_quadround 52 + sha256_sse2_main_round 56 + jz sha256d_ms_4way_sse2_finish + sha256_sse2_main_round 57 + sha256_sse2_main_round 58 + sha256_sse2_main_round 59 + sha256_sse2_main_quadround 60 + + movdqa 5*16(%rsp), %xmm1 + movdqa 6*16(%rsp), %xmm2 + movdqa 7*16(%rsp), %xmm6 + movdqa %xmm1, 18*16(%rsi) + movdqa %xmm2, 19*16(%rsi) + movdqa %xmm6, 20*16(%rsi) + movdqa 9*16(%rsp), %xmm1 + movdqa 10*16(%rsp), %xmm2 + movdqa 11*16(%rsp), %xmm6 + movdqa %xmm1, 22*16(%rsi) + movdqa %xmm2, 23*16(%rsi) + movdqa %xmm6, 24*16(%rsi) + movdqa 17*16(%rsp), %xmm1 + movdqa 18*16(%rsp), %xmm2 + movdqa %xmm1, 30*16(%rsi) + movdqa %xmm2, 31*16(%rsi) + + movdqa 0(%rsp), %xmm1 + movdqa 16(%rsp), %xmm2 + movdqa 32(%rsp), %xmm6 + paddd 0(%rdx), %xmm7 + paddd 16(%rdx), %xmm5 + paddd 32(%rdx), %xmm4 + paddd 48(%rdx), %xmm3 + paddd 64(%rdx), %xmm0 + paddd 80(%rdx), %xmm1 + paddd 96(%rdx), %xmm2 + paddd 112(%rdx), %xmm6 + + movdqa %xmm7, 48+0(%rsp) + movdqa %xmm5, 48+16(%rsp) + movdqa %xmm4, 48+32(%rsp) + movdqa %xmm3, 48+48(%rsp) + movdqa %xmm0, 48+64(%rsp) + movdqa %xmm1, 48+80(%rsp) + movdqa %xmm2, 48+96(%rsp) + movdqa %xmm6, 48+112(%rsp) + + pxor %xmm0, %xmm0 + movq $0x8000000000000100, %rax + movd %rax, %xmm1 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0x00, %xmm1, %xmm1 + movdqa %xmm2, 48+128(%rsp) + movdqa %xmm0, 48+144(%rsp) + movdqa %xmm0, 48+160(%rsp) + movdqa %xmm0, 48+176(%rsp) + movdqa %xmm0, 48+192(%rsp) + movdqa %xmm0, 48+208(%rsp) + movdqa %xmm0, 48+224(%rsp) + movdqa %xmm1, 48+240(%rsp) + + leaq 19*16(%rsp), %rax + cmpq %rax, %rax + + movdqa -15*16(%rax), %xmm0 + movdqa -14*16(%rax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + paddd -16*16(%rax), %xmm0 + paddd -15*16(%rax), %xmm4 + paddd sha256d_4preext2_17(%rip), %xmm4 + movdqa %xmm0, %xmm3 + movdqa %xmm4, %xmm7 + movdqa %xmm3, 0*16(%rax) + movdqa %xmm7, 1*16(%rax) + + sha256_sse2_extend_doubleround 2 + sha256_sse2_extend_doubleround 4 + + movdqa -9*16(%rax), %xmm0 + movdqa sha256d_4preext2_23(%rip), %xmm4 + movdqa %xmm0, %xmm2 + psrld $3, %xmm0 + movdqa %xmm0, 
%xmm1 + pslld $14, %xmm2 + psrld $4, %xmm1 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + psrld $11, %xmm1 + pslld $11, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + paddd -10*16(%rax), %xmm0 + paddd -9*16(%rax), %xmm4 + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + paddd -1*16(%rax), %xmm0 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + paddd 0*16(%rax), %xmm4 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 6*16(%rax) + movdqa %xmm7, 7*16(%rax) + + movdqa sha256d_4preext2_24(%rip), %xmm0 + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + paddd 1*16(%rax), %xmm0 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd 2*16(%rax), %xmm7 + movdqa %xmm3, 8*16(%rax) + movdqa %xmm7, 9*16(%rax) + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd 3*16(%rax), %xmm3 + paddd 4*16(%rax), %xmm7 + movdqa %xmm3, 10*16(%rax) + movdqa %xmm7, 11*16(%rax) + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd 5*16(%rax), %xmm3 + paddd 6*16(%rax), %xmm7 + movdqa %xmm3, 12*16(%rax) + movdqa %xmm7, 13*16(%rax) + + movdqa sha256d_4preext2_30(%rip), %xmm0 + movdqa 0*16(%rax), %xmm4 + movdqa %xmm4, %xmm6 + psrld $3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $14, %xmm6 + psrld $4, %xmm5 + pxor %xmm5, %xmm4 + pxor %xmm6, %xmm4 + psrld $11, %xmm5 + pslld $11, %xmm6 + pxor %xmm5, %xmm4 + pxor %xmm6, %xmm4 + paddd -1*16(%rax), %xmm4 + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + paddd 7*16(%rax), %xmm0 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + paddd 8*16(%rax), %xmm4 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 14*16(%rax) + movdqa %xmm7, 15*16(%rax) + + jmp sha256d_ms_4way_sse2_extend_loop2 + +sha256d_ms_4way_sse2_extend_coda2: + sha256_sse2_extend_round 44 + + movdqa sha256_4h+0(%rip), %xmm7 + movdqa 
sha256_4h+16(%rip), %xmm5 + movdqa sha256_4h+32(%rip), %xmm4 + movdqa sha256_4h+48(%rip), %xmm3 + movdqa sha256_4h+64(%rip), %xmm0 + movdqa sha256_4h+80(%rip), %xmm1 + movdqa sha256_4h+96(%rip), %xmm2 + movdqa sha256_4h+112(%rip), %xmm6 + movdqa %xmm1, 0(%rsp) + movdqa %xmm2, 16(%rsp) + movdqa %xmm6, 32(%rsp) + + leaq 48(%rsp), %rax + leaq sha256_4k(%rip), %rcx + jmp sha256d_ms_4way_sse2_main_loop2 + +.macro sha256_sse2_main_round_red i, r7 + movdqa 16*\i(%rax), %xmm6 + paddd 16*\i(%rcx), %xmm6 + paddd 32(%rsp), %xmm6 + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + paddd \r7, %xmm6 + pandn %xmm2, %xmm1 + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + paddd %xmm1, %xmm6 + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $5, %xmm1 + pxor %xmm1, %xmm0 + paddd %xmm6, %xmm0 +.endm + +sha256d_ms_4way_sse2_finish: + sha256_sse2_main_round_red 57, %xmm3 + sha256_sse2_main_round_red 58, %xmm4 + sha256_sse2_main_round_red 59, %xmm5 + sha256_sse2_main_round_red 60, %xmm7 + + paddd sha256_4h+112(%rip), %xmm0 + movdqa %xmm0, 112(%rdi) + + addq $8+67*16, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + movdqa 0(%rsp), %xmm6 + movdqa 16(%rsp), %xmm7 + addq $32, %rsp + popq %rdi +#endif + ret + + +#if defined(USE_AVX) + + .p2align 6 +sha256d_ms_4way_avx: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + subq $80, %rsp + movdqa %xmm6, 0(%rsp) + movdqa %xmm7, 16(%rsp) + movdqa %xmm8, 32(%rsp) + movdqa %xmm9, 48(%rsp) + movdqa %xmm10, 64(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + subq $1032, %rsp + + leaq 256(%rsi), %rax + +sha256d_ms_4way_avx_extend_loop1: + vmovdqa 3*16(%rsi), %xmm0 + vmovdqa 2*16(%rax), %xmm3 + vmovdqa 3*16(%rax), %xmm7 + vmovdqa %xmm3, 2*16(%rsp) + vmovdqa %xmm7, 3*16(%rsp) + vpaddd %xmm0, %xmm7, %xmm7 + vpslld $14, %xmm0, %xmm2 + vpsrld $3, %xmm0, %xmm0 + vpsrld $4, %xmm0, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpsrld $11, %xmm1, %xmm1 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm3, %xmm3 + vmovdqa %xmm3, 2*16(%rax) + vmovdqa %xmm7, 3*16(%rax) + + vmovdqa 4*16(%rax), %xmm0 + vmovdqa %xmm0, 4*16(%rsp) + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vmovdqa %xmm3, 4*16(%rax) + vmovdqa %xmm7, 5*16(%rax) + + vmovdqa 6*16(%rax), %xmm0 + vmovdqa 7*16(%rax), %xmm4 + vmovdqa %xmm0, 6*16(%rsp) + vmovdqa %xmm4, 7*16(%rsp) + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, 
%xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 6*16(%rax) + vmovdqa %xmm7, 7*16(%rax) + + vmovdqa 8*16(%rax), %xmm0 + vmovdqa 2*16(%rax), %xmm4 + vmovdqa %xmm0, 8*16(%rsp) + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 8*16(%rax) + vmovdqa %xmm7, 9*16(%rax) + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 3*16(%rax), %xmm3, %xmm3 + vpaddd 4*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 10*16(%rax) + vmovdqa %xmm7, 11*16(%rax) + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 5*16(%rax), %xmm3, %xmm3 + vpaddd 6*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 12*16(%rax) + vmovdqa %xmm7, 13*16(%rax) + + vmovdqa 14*16(%rax), %xmm0 + vmovdqa 15*16(%rax), %xmm4 + vmovdqa %xmm0, 14*16(%rsp) + vmovdqa %xmm4, 15*16(%rsp) + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpaddd 7*16(%rax), %xmm0, %xmm0 + vpaddd 8*16(%rax), %xmm4, %xmm4 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 14*16(%rax) + vmovdqa %xmm7, 15*16(%rax) + +sha256d_ms_4way_avx_extend_loop2: + sha256_avx_extend_doubleround 16 + sha256_avx_extend_doubleround 18 + sha256_avx_extend_doubleround 20 + sha256_avx_extend_doubleround 22 + sha256_avx_extend_doubleround 24 + sha256_avx_extend_doubleround 26 + sha256_avx_extend_doubleround 28 + sha256_avx_extend_doubleround 30 + sha256_avx_extend_doubleround 32 + sha256_avx_extend_doubleround 34 + sha256_avx_extend_doubleround 36 + sha256_avx_extend_doubleround 38 + sha256_avx_extend_doubleround 40 + sha256_avx_extend_doubleround 42 + jz sha256d_ms_4way_avx_extend_coda2 + sha256_avx_extend_doubleround 44 + sha256_avx_extend_doubleround 46 + + 
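+	/* Fall-through path for the first hash (ZF clear): load the precomputed
+	 * working state a..h from the buffer at (%rcx) and re-enter the main
+	 * rounds at round 3 via sha256d_ms_4way_avx_main_loop1.  The second hash
+	 * sets ZF beforehand and takes the jz to the extend coda above instead. */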
movdqa 0(%rcx), %xmm7 + movdqa 16(%rcx), %xmm8 + movdqa 32(%rcx), %xmm9 + movdqa 48(%rcx), %xmm10 + movdqa 64(%rcx), %xmm0 + movdqa 80(%rcx), %xmm5 + movdqa 96(%rcx), %xmm4 + movdqa 112(%rcx), %xmm3 + + movq %rsi, %rax + leaq sha256_4k(%rip), %rcx + jmp sha256d_ms_4way_avx_main_loop1 + +sha256d_ms_4way_avx_main_loop2: + sha256_avx_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 + sha256_avx_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 + sha256_avx_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 +sha256d_ms_4way_avx_main_loop1: + sha256_avx_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 + sha256_avx_main_quadround 4 + sha256_avx_main_quadround 8 + sha256_avx_main_quadround 12 + sha256_avx_main_quadround 16 + sha256_avx_main_quadround 20 + sha256_avx_main_quadround 24 + sha256_avx_main_quadround 28 + sha256_avx_main_quadround 32 + sha256_avx_main_quadround 36 + sha256_avx_main_quadround 40 + sha256_avx_main_quadround 44 + sha256_avx_main_quadround 48 + sha256_avx_main_quadround 52 + sha256_avx_main_round 56, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 + jz sha256d_ms_4way_avx_finish + sha256_avx_main_round 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 + sha256_avx_main_round 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 + sha256_avx_main_round 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 + sha256_avx_main_quadround 60 + + movdqa 2*16(%rsp), %xmm1 + movdqa 3*16(%rsp), %xmm2 + movdqa 4*16(%rsp), %xmm6 + movdqa %xmm1, 18*16(%rsi) + movdqa %xmm2, 19*16(%rsi) + movdqa %xmm6, 20*16(%rsi) + movdqa 6*16(%rsp), %xmm1 + movdqa 7*16(%rsp), %xmm2 + movdqa 8*16(%rsp), %xmm6 + movdqa %xmm1, 22*16(%rsi) + movdqa %xmm2, 23*16(%rsi) + movdqa %xmm6, 24*16(%rsi) + movdqa 14*16(%rsp), %xmm1 + movdqa 15*16(%rsp), %xmm2 + movdqa %xmm1, 30*16(%rsi) + movdqa %xmm2, 31*16(%rsi) + + paddd 0(%rdx), %xmm7 + paddd 16(%rdx), %xmm5 + paddd 32(%rdx), %xmm4 + paddd 48(%rdx), %xmm3 + paddd 64(%rdx), %xmm0 + paddd 80(%rdx), %xmm8 + paddd 96(%rdx), %xmm9 + paddd 112(%rdx), %xmm10 + + movdqa %xmm7, 0(%rsp) + movdqa %xmm5, 16(%rsp) + movdqa %xmm4, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm0, 64(%rsp) + movdqa %xmm8, 80(%rsp) + movdqa %xmm9, 96(%rsp) + movdqa %xmm10, 112(%rsp) + + pxor %xmm0, %xmm0 + movq $0x8000000000000100, %rax + movd %rax, %xmm1 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0x00, %xmm1, %xmm1 + movdqa %xmm2, 128(%rsp) + movdqa %xmm0, 144(%rsp) + movdqa %xmm0, 160(%rsp) + movdqa %xmm0, 176(%rsp) + movdqa %xmm0, 192(%rsp) + movdqa %xmm0, 208(%rsp) + movdqa %xmm0, 224(%rsp) + movdqa %xmm1, 240(%rsp) + + leaq 256(%rsp), %rax + cmpq %rax, %rax + + vmovdqa -15*16(%rax), %xmm0 + vmovdqa -14*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd -16*16(%rax), %xmm8, %xmm3 + vpaddd sha256d_4preext2_17(%rip), %xmm4, %xmm7 + vmovdqa %xmm3, 0*16(%rax) + vmovdqa %xmm7, 1*16(%rax) + + sha256_avx_extend_doubleround 2 + sha256_avx_extend_doubleround 4 + + vmovdqa -9*16(%rax), %xmm0 + vpslld $14, %xmm0, 
%xmm2 + vpsrld $3, %xmm0, %xmm8 + vpsrld $7, %xmm0, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpsrld $11, %xmm1, %xmm1 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd sha256d_4preext2_23(%rip), %xmm0, %xmm4 + vpaddd -10*16(%rax), %xmm8, %xmm0 + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpaddd -1*16(%rax), %xmm0, %xmm0 + vpaddd 0*16(%rax), %xmm4, %xmm4 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 6*16(%rax) + vmovdqa %xmm7, 7*16(%rax) + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd sha256d_4preext2_24(%rip), %xmm3, %xmm3 + vpaddd 1*16(%rax), %xmm3, %xmm3 + vpaddd 2*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 8*16(%rax) + vmovdqa %xmm7, 9*16(%rax) + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 3*16(%rax), %xmm3, %xmm3 + vpaddd 4*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 10*16(%rax) + vmovdqa %xmm7, 11*16(%rax) + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 5*16(%rax), %xmm3, %xmm3 + vpaddd 6*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 12*16(%rax) + vmovdqa %xmm7, 13*16(%rax) + + vmovdqa sha256d_4preext2_30(%rip), %xmm0 + vmovdqa 0*16(%rax), %xmm4 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm4, %xmm4 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm6, %xmm4, %xmm4 + vpsrld $11, %xmm5, %xmm5 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm6, %xmm4, %xmm4 + vpaddd -1*16(%rax), %xmm4, %xmm4 + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpaddd 7*16(%rax), %xmm0, %xmm0 + vpaddd 8*16(%rax), %xmm4, %xmm4 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, 
%xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 14*16(%rax) + vmovdqa %xmm7, 15*16(%rax) + + jmp sha256d_ms_4way_avx_extend_loop2 + +sha256d_ms_4way_avx_extend_coda2: + sha256_avx_extend_round 44 + + movdqa sha256_4h+0(%rip), %xmm7 + movdqa sha256_4h+16(%rip), %xmm5 + movdqa sha256_4h+32(%rip), %xmm4 + movdqa sha256_4h+48(%rip), %xmm3 + movdqa sha256_4h+64(%rip), %xmm0 + movdqa sha256_4h+80(%rip), %xmm8 + movdqa sha256_4h+96(%rip), %xmm9 + movdqa sha256_4h+112(%rip), %xmm10 + + movq %rsp, %rax + leaq sha256_4k(%rip), %rcx + jmp sha256d_ms_4way_avx_main_loop2 + +.macro sha256_avx_main_round_red i, r0, r1, r2, r3, r4 + vpaddd 16*\i(%rax), \r0, %xmm6 + vpaddd 16*\i(%rcx), %xmm6, %xmm6 + vpandn \r1, \r3, %xmm1 + vpand \r3, \r2, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + vpslld $7, \r3, %xmm1 + vpsrld $6, \r3, \r0 + vpsrld $5, \r0, %xmm2 + vpxor %xmm1, \r0, \r0 + vpxor %xmm2, \r0, \r0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, \r0, \r0 + vpxor %xmm2, \r0, \r0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, \r0, \r0 + vpaddd \r0, %xmm6, %xmm6 + vpaddd %xmm6, \r4, \r0 +.endm + +sha256d_ms_4way_avx_finish: + sha256_avx_main_round_red 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4 + sha256_avx_main_round_red 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5 + sha256_avx_main_round_red 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7 + sha256_avx_main_round_red 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3 + + paddd sha256_4h+112(%rip), %xmm10 + movdqa %xmm10, 112(%rdi) + + addq $1032, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + movdqa 0(%rsp), %xmm6 + movdqa 16(%rsp), %xmm7 + movdqa 32(%rsp), %xmm8 + movdqa 48(%rsp), %xmm9 + movdqa 64(%rsp), %xmm10 + addq $80, %rsp + popq %rdi +#endif + ret + +#endif /* USE_AVX */ + + +#if defined(USE_XOP) + + .p2align 6 +sha256d_ms_4way_xop: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + subq $80, %rsp + movdqa %xmm6, 0(%rsp) + movdqa %xmm7, 16(%rsp) + movdqa %xmm8, 32(%rsp) + movdqa %xmm9, 48(%rsp) + movdqa %xmm10, 64(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + subq $1032, %rsp + + leaq 256(%rsi), %rax + +sha256d_ms_4way_xop_extend_loop1: + vmovdqa 3*16(%rsi), %xmm0 + vmovdqa 2*16(%rax), %xmm3 + vmovdqa 3*16(%rax), %xmm7 + vmovdqa %xmm3, 2*16(%rsp) + vmovdqa %xmm7, 3*16(%rsp) + vpaddd %xmm0, %xmm7, %xmm7 + vprotd $25, %xmm0, %xmm1 + vprotd $14, %xmm0, %xmm2 + vpsrld $3, %xmm0, %xmm0 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm3, %xmm3 + vmovdqa %xmm3, 2*16(%rax) + vmovdqa %xmm7, 3*16(%rax) + + vmovdqa 4*16(%rax), %xmm0 + vmovdqa %xmm0, 4*16(%rsp) + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vmovdqa %xmm3, 4*16(%rax) + vmovdqa %xmm7, 5*16(%rax) + + vmovdqa 6*16(%rax), %xmm0 + vmovdqa 7*16(%rax), %xmm4 + vmovdqa %xmm0, 6*16(%rsp) + vmovdqa %xmm4, 7*16(%rsp) + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, 
%xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 6*16(%rax) + vmovdqa %xmm7, 7*16(%rax) + + vmovdqa 8*16(%rax), %xmm0 + vmovdqa 2*16(%rax), %xmm4 + vmovdqa %xmm0, 8*16(%rsp) + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 8*16(%rax) + vmovdqa %xmm7, 9*16(%rax) + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 3*16(%rax), %xmm3, %xmm3 + vpaddd 4*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 10*16(%rax) + vmovdqa %xmm7, 11*16(%rax) + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 5*16(%rax), %xmm3, %xmm3 + vpaddd 6*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 12*16(%rax) + vmovdqa %xmm7, 13*16(%rax) + + vmovdqa 14*16(%rax), %xmm0 + vmovdqa 15*16(%rax), %xmm4 + vmovdqa %xmm0, 14*16(%rsp) + vmovdqa %xmm4, 15*16(%rsp) + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpaddd 7*16(%rax), %xmm0, %xmm0 + vpaddd 8*16(%rax), %xmm4, %xmm4 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 14*16(%rax) + vmovdqa %xmm7, 15*16(%rax) + +sha256d_ms_4way_xop_extend_loop2: + sha256_xop_extend_doubleround 16 + sha256_xop_extend_doubleround 18 + sha256_xop_extend_doubleround 20 + sha256_xop_extend_doubleround 22 + sha256_xop_extend_doubleround 24 + sha256_xop_extend_doubleround 26 + sha256_xop_extend_doubleround 28 + sha256_xop_extend_doubleround 30 + sha256_xop_extend_doubleround 32 + sha256_xop_extend_doubleround 34 + sha256_xop_extend_doubleround 36 + sha256_xop_extend_doubleround 38 + sha256_xop_extend_doubleround 40 + sha256_xop_extend_doubleround 42 + jz sha256d_ms_4way_xop_extend_coda2 + sha256_xop_extend_doubleround 44 + sha256_xop_extend_doubleround 46 + + movdqa 0(%rcx), %xmm7 + movdqa 16(%rcx), %xmm8 + movdqa 32(%rcx), %xmm9 + movdqa 48(%rcx), %xmm10 + movdqa 64(%rcx), %xmm0 + movdqa 80(%rcx), %xmm5 + movdqa 96(%rcx), %xmm4 + movdqa 112(%rcx), %xmm3 + + movq %rsi, %rax + leaq sha256_4k(%rip), %rcx + jmp sha256d_ms_4way_xop_main_loop1 + +sha256d_ms_4way_xop_main_loop2: + sha256_xop_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 + sha256_xop_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 + sha256_xop_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 +sha256d_ms_4way_xop_main_loop1: + sha256_xop_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 + sha256_xop_main_quadround 4 + sha256_xop_main_quadround 8 + sha256_xop_main_quadround 12 + sha256_xop_main_quadround 16 + sha256_xop_main_quadround 20 + 
sha256_xop_main_quadround 24 + sha256_xop_main_quadround 28 + sha256_xop_main_quadround 32 + sha256_xop_main_quadround 36 + sha256_xop_main_quadround 40 + sha256_xop_main_quadround 44 + sha256_xop_main_quadround 48 + sha256_xop_main_quadround 52 + sha256_xop_main_round 56, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 + jz sha256d_ms_4way_xop_finish + sha256_xop_main_round 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 + sha256_xop_main_round 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 + sha256_xop_main_round 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 + sha256_xop_main_quadround 60 + + movdqa 2*16(%rsp), %xmm1 + movdqa 3*16(%rsp), %xmm2 + movdqa 4*16(%rsp), %xmm6 + movdqa %xmm1, 18*16(%rsi) + movdqa %xmm2, 19*16(%rsi) + movdqa %xmm6, 20*16(%rsi) + movdqa 6*16(%rsp), %xmm1 + movdqa 7*16(%rsp), %xmm2 + movdqa 8*16(%rsp), %xmm6 + movdqa %xmm1, 22*16(%rsi) + movdqa %xmm2, 23*16(%rsi) + movdqa %xmm6, 24*16(%rsi) + movdqa 14*16(%rsp), %xmm1 + movdqa 15*16(%rsp), %xmm2 + movdqa %xmm1, 30*16(%rsi) + movdqa %xmm2, 31*16(%rsi) + + paddd 0(%rdx), %xmm7 + paddd 16(%rdx), %xmm5 + paddd 32(%rdx), %xmm4 + paddd 48(%rdx), %xmm3 + paddd 64(%rdx), %xmm0 + paddd 80(%rdx), %xmm8 + paddd 96(%rdx), %xmm9 + paddd 112(%rdx), %xmm10 + + movdqa %xmm7, 0(%rsp) + movdqa %xmm5, 16(%rsp) + movdqa %xmm4, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm0, 64(%rsp) + movdqa %xmm8, 80(%rsp) + movdqa %xmm9, 96(%rsp) + movdqa %xmm10, 112(%rsp) + + pxor %xmm0, %xmm0 + movq $0x8000000000000100, %rax + movd %rax, %xmm1 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0x00, %xmm1, %xmm1 + movdqa %xmm2, 128(%rsp) + movdqa %xmm0, 144(%rsp) + movdqa %xmm0, 160(%rsp) + movdqa %xmm0, 176(%rsp) + movdqa %xmm0, 192(%rsp) + movdqa %xmm0, 208(%rsp) + movdqa %xmm0, 224(%rsp) + movdqa %xmm1, 240(%rsp) + + leaq 256(%rsp), %rax + cmpq %rax, %rax + + vmovdqa -15*16(%rax), %xmm0 + vmovdqa -14*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd -16*16(%rax), %xmm8, %xmm3 + vpaddd sha256d_4preext2_17(%rip), %xmm4, %xmm7 + vmovdqa %xmm3, 0*16(%rax) + vmovdqa %xmm7, 1*16(%rax) + + sha256_xop_extend_doubleround 2 + sha256_xop_extend_doubleround 4 + + vmovdqa -9*16(%rax), %xmm0 + vprotd $25, %xmm0, %xmm1 + vprotd $14, %xmm0, %xmm2 + vpsrld $3, %xmm0, %xmm8 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd sha256d_4preext2_23(%rip), %xmm0, %xmm4 + vpaddd -10*16(%rax), %xmm8, %xmm0 + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpaddd -1*16(%rax), %xmm0, %xmm0 + vpaddd 0*16(%rax), %xmm4, %xmm4 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 6*16(%rax) + vmovdqa %xmm7, 7*16(%rax) + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd sha256d_4preext2_24(%rip), %xmm3, %xmm3 + vpaddd 1*16(%rax), %xmm3, %xmm3 + vpaddd 2*16(%rax), %xmm7, %xmm7 + 
vmovdqa %xmm3, 8*16(%rax) + vmovdqa %xmm7, 9*16(%rax) + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 3*16(%rax), %xmm3, %xmm3 + vpaddd 4*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 10*16(%rax) + vmovdqa %xmm7, 11*16(%rax) + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 5*16(%rax), %xmm3, %xmm3 + vpaddd 6*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 12*16(%rax) + vmovdqa %xmm7, 13*16(%rax) + + vmovdqa sha256d_4preext2_30(%rip), %xmm0 + vmovdqa 0*16(%rax), %xmm4 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm6, %xmm4, %xmm4 + vpaddd -1*16(%rax), %xmm4, %xmm4 + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpaddd 7*16(%rax), %xmm0, %xmm0 + vpaddd 8*16(%rax), %xmm4, %xmm4 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 14*16(%rax) + vmovdqa %xmm7, 15*16(%rax) + + jmp sha256d_ms_4way_xop_extend_loop2 + +sha256d_ms_4way_xop_extend_coda2: + sha256_xop_extend_round 44 + + movdqa sha256_4h+0(%rip), %xmm7 + movdqa sha256_4h+16(%rip), %xmm5 + movdqa sha256_4h+32(%rip), %xmm4 + movdqa sha256_4h+48(%rip), %xmm3 + movdqa sha256_4h+64(%rip), %xmm0 + movdqa sha256_4h+80(%rip), %xmm8 + movdqa sha256_4h+96(%rip), %xmm9 + movdqa sha256_4h+112(%rip), %xmm10 + + movq %rsp, %rax + leaq sha256_4k(%rip), %rcx + jmp sha256d_ms_4way_xop_main_loop2 + +.macro sha256_xop_main_round_red i, r0, r1, r2, r3, r4 + vpaddd 16*\i(%rax), \r0, %xmm6 + vpaddd 16*\i(%rcx), %xmm6, %xmm6 + vpandn \r1, \r3, %xmm1 + vpand \r3, \r2, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + vprotd $26, \r3, %xmm1 + vprotd $21, \r3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, \r3, \r0 + vpxor %xmm2, \r0, \r0 + vpaddd \r0, %xmm6, %xmm6 + vpaddd %xmm6, \r4, \r0 +.endm + +sha256d_ms_4way_xop_finish: + sha256_xop_main_round_red 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4 + sha256_xop_main_round_red 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5 + sha256_xop_main_round_red 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7 + sha256_xop_main_round_red 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3 + + paddd sha256_4h+112(%rip), %xmm10 + movdqa %xmm10, 112(%rdi) + + addq $1032, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + movdqa 0(%rsp), %xmm6 + movdqa 16(%rsp), %xmm7 + movdqa 32(%rsp), %xmm8 + movdqa 48(%rsp), %xmm9 + movdqa 64(%rsp), %xmm10 + addq $80, %rsp + popq %rdi +#endif + ret + +#endif /* USE_XOP */ + + + .text + .p2align 6 + .globl sha256_use_4way + .globl _sha256_use_4way +sha256_use_4way: +_sha256_use_4way: + pushq %rbx + pushq %rcx + pushq %rdx + + /* Check for VIA PadLock Hash Engine */ + movl $0xc0000000, %eax + cpuid + cmpl $0xc0000001, %eax + jb sha256_use_4way_no_phe + movl $0xc0000001, %eax + cpuid + andl $0x00000c00, %edx + cmpl $0x00000c00, %edx + jne sha256_use_4way_no_phe + leaq sha256_transform_phe(%rip), %rdx + movq %rdx, 
sha256_transform_addr(%rip) + xorl %eax, %eax + jmp sha256_use_4way_exit +sha256_use_4way_no_phe: +#if defined(USE_AVX) + /* Check for AVX and OSXSAVE support */ + movl $1, %eax + cpuid + andl $0x18000000, %ecx + cmpl $0x18000000, %ecx + jne sha256_use_4way_base + /* Check for XMM and YMM state support */ + xorl %ecx, %ecx + xgetbv + andl $0x00000006, %eax + cmpl $0x00000006, %eax + jne sha256_use_4way_base +#if defined(USE_XOP) + /* Check for XOP support */ + movl $0x80000001, %eax + cpuid + andl $0x00000800, %ecx + jz sha256_use_4way_avx + +sha256_use_4way_xop: + leaq sha256d_ms_4way_xop(%rip), %rcx + leaq sha256_transform_4way_core_xop(%rip), %rdx + jmp sha256_use_4way_done +#endif /* USE_XOP */ + +sha256_use_4way_avx: + leaq sha256d_ms_4way_avx(%rip), %rcx + leaq sha256_transform_4way_core_avx(%rip), %rdx + jmp sha256_use_4way_done +#endif /* USE_AVX */ + +sha256_use_4way_base: + leaq sha256d_ms_4way_sse2(%rip), %rcx + leaq sha256_transform_4way_core_sse2(%rip), %rdx + +sha256_use_4way_done: + movq %rcx, sha256d_ms_4way_addr(%rip) + movq %rdx, sha256_transform_4way_core_addr(%rip) + movl $1, %eax +sha256_use_4way_exit: + popq %rdx + popq %rcx + popq %rbx + ret + + +#if defined(USE_AVX2) + + .text + .p2align 6 + .globl sha256d_ms_8way + .globl _sha256d_ms_8way +sha256d_ms_8way: +_sha256d_ms_8way: +sha256d_ms_8way_avx2: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + subq $80, %rsp + vmovdqa %xmm6, 0(%rsp) + vmovdqa %xmm7, 16(%rsp) + vmovdqa %xmm8, 32(%rsp) + vmovdqa %xmm9, 48(%rsp) + vmovdqa %xmm10, 64(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + pushq %rbp + movq %rsp, %rbp + subq $64*32, %rsp + andq $-128, %rsp + + leaq 16*32(%rsi), %rax + +sha256d_ms_8way_avx2_extend_loop1: + vmovdqa 3*32(%rsi), %ymm0 + vmovdqa 2*32(%rax), %ymm3 + vmovdqa 3*32(%rax), %ymm7 + vmovdqa %ymm3, 2*32(%rsp) + vmovdqa %ymm7, 3*32(%rsp) + vpaddd %ymm0, %ymm7, %ymm7 + vpslld $14, %ymm0, %ymm2 + vpsrld $3, %ymm0, %ymm0 + vpsrld $4, %ymm0, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpsrld $11, %ymm1, %ymm1 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpaddd %ymm0, %ymm3, %ymm3 + vmovdqa %ymm3, 2*32(%rax) + vmovdqa %ymm7, 3*32(%rax) + + vmovdqa 4*32(%rax), %ymm0 + vmovdqa %ymm0, 4*32(%rsp) + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd %ymm0, %ymm3, %ymm3 + vmovdqa %ymm3, 4*32(%rax) + vmovdqa %ymm7, 5*32(%rax) + + vmovdqa 6*32(%rax), %ymm0 + vmovdqa 7*32(%rax), %ymm4 + vmovdqa %ymm0, 6*32(%rsp) + vmovdqa %ymm4, 7*32(%rsp) + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, 
%ymm7, %ymm7 + vmovdqa %ymm3, 6*32(%rax) + vmovdqa %ymm7, 7*32(%rax) + + vmovdqa 8*32(%rax), %ymm0 + vmovdqa 2*32(%rax), %ymm4 + vmovdqa %ymm0, 8*32(%rsp) + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 8*32(%rax) + vmovdqa %ymm7, 9*32(%rax) + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd 3*32(%rax), %ymm3, %ymm3 + vpaddd 4*32(%rax), %ymm7, %ymm7 + vmovdqa %ymm3, 10*32(%rax) + vmovdqa %ymm7, 11*32(%rax) + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd 5*32(%rax), %ymm3, %ymm3 + vpaddd 6*32(%rax), %ymm7, %ymm7 + vmovdqa %ymm3, 12*32(%rax) + vmovdqa %ymm7, 13*32(%rax) + + vmovdqa 14*32(%rax), %ymm0 + vmovdqa 15*32(%rax), %ymm4 + vmovdqa %ymm0, 14*32(%rsp) + vmovdqa %ymm4, 15*32(%rsp) + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpaddd 7*32(%rax), %ymm0, %ymm0 + vpaddd 8*32(%rax), %ymm4, %ymm4 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 14*32(%rax) + vmovdqa %ymm7, 15*32(%rax) + +sha256d_ms_8way_avx2_extend_loop2: + sha256_avx2_extend_doubleround 16 + sha256_avx2_extend_doubleround 18 + sha256_avx2_extend_doubleround 20 + sha256_avx2_extend_doubleround 22 + sha256_avx2_extend_doubleround 24 + sha256_avx2_extend_doubleround 26 + sha256_avx2_extend_doubleround 28 + sha256_avx2_extend_doubleround 30 + sha256_avx2_extend_doubleround 32 + sha256_avx2_extend_doubleround 34 + sha256_avx2_extend_doubleround 36 + sha256_avx2_extend_doubleround 38 + sha256_avx2_extend_doubleround 40 + sha256_avx2_extend_doubleround 42 + jz sha256d_ms_8way_avx2_extend_coda2 + sha256_avx2_extend_doubleround 44 + sha256_avx2_extend_doubleround 46 + + vmovdqa 0(%rcx), %ymm7 + vmovdqa 32(%rcx), %ymm8 + vmovdqa 64(%rcx), %ymm9 + vmovdqa 96(%rcx), %ymm10 + vmovdqa 
128(%rcx), %ymm0 + vmovdqa 160(%rcx), %ymm5 + vmovdqa 192(%rcx), %ymm4 + vmovdqa 224(%rcx), %ymm3 + + movq %rsi, %rax + leaq sha256_8k(%rip), %rcx + jmp sha256d_ms_8way_avx2_main_loop1 + +sha256d_ms_8way_avx2_main_loop2: + sha256_avx2_main_round 0, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7 + sha256_avx2_main_round 1, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3 + sha256_avx2_main_round 2, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4 +sha256d_ms_8way_avx2_main_loop1: + sha256_avx2_main_round 3, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5 + sha256_avx2_main_quadround 4 + sha256_avx2_main_quadround 8 + sha256_avx2_main_quadround 12 + sha256_avx2_main_quadround 16 + sha256_avx2_main_quadround 20 + sha256_avx2_main_quadround 24 + sha256_avx2_main_quadround 28 + sha256_avx2_main_quadround 32 + sha256_avx2_main_quadround 36 + sha256_avx2_main_quadround 40 + sha256_avx2_main_quadround 44 + sha256_avx2_main_quadround 48 + sha256_avx2_main_quadround 52 + sha256_avx2_main_round 56, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7 + jz sha256d_ms_8way_avx2_finish + sha256_avx2_main_round 57, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3 + sha256_avx2_main_round 58, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4 + sha256_avx2_main_round 59, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5 + sha256_avx2_main_quadround 60 + + vmovdqa 2*32(%rsp), %ymm1 + vmovdqa 3*32(%rsp), %ymm2 + vmovdqa 4*32(%rsp), %ymm6 + vmovdqa %ymm1, 18*32(%rsi) + vmovdqa %ymm2, 19*32(%rsi) + vmovdqa %ymm6, 20*32(%rsi) + vmovdqa 6*32(%rsp), %ymm1 + vmovdqa 7*32(%rsp), %ymm2 + vmovdqa 8*32(%rsp), %ymm6 + vmovdqa %ymm1, 22*32(%rsi) + vmovdqa %ymm2, 23*32(%rsi) + vmovdqa %ymm6, 24*32(%rsi) + vmovdqa 14*32(%rsp), %ymm1 + vmovdqa 15*32(%rsp), %ymm2 + vmovdqa %ymm1, 30*32(%rsi) + vmovdqa %ymm2, 31*32(%rsi) + + vpaddd 0(%rdx), %ymm7, %ymm7 + vpaddd 32(%rdx), %ymm5, %ymm5 + vpaddd 64(%rdx), %ymm4, %ymm4 + vpaddd 96(%rdx), %ymm3, %ymm3 + vpaddd 128(%rdx), %ymm0, %ymm0 + vpaddd 160(%rdx), %ymm8, %ymm8 + vpaddd 192(%rdx), %ymm9, %ymm9 + vpaddd 224(%rdx), %ymm10, %ymm10 + + vmovdqa %ymm7, 0(%rsp) + vmovdqa %ymm5, 32(%rsp) + vmovdqa %ymm4, 64(%rsp) + vmovdqa %ymm3, 96(%rsp) + vmovdqa %ymm0, 128(%rsp) + vmovdqa %ymm8, 160(%rsp) + vmovdqa %ymm9, 192(%rsp) + vmovdqa %ymm10, 224(%rsp) + + vpxor %ymm0, %ymm0, %ymm0 + movq $0x8000000000000100, %rax + vmovd %rax, %xmm1 + vinserti128 $1, %xmm1, %ymm1, %ymm1 + vpshufd $0x55, %ymm1, %ymm2 + vpshufd $0x00, %ymm1, %ymm1 + vmovdqa %ymm2, 8*32(%rsp) + vmovdqa %ymm0, 9*32(%rsp) + vmovdqa %ymm0, 10*32(%rsp) + vmovdqa %ymm0, 11*32(%rsp) + vmovdqa %ymm0, 12*32(%rsp) + vmovdqa %ymm0, 13*32(%rsp) + vmovdqa %ymm0, 14*32(%rsp) + vmovdqa %ymm1, 15*32(%rsp) + + leaq 16*32(%rsp), %rax + cmpq %rax, %rax + + vmovdqa -15*32(%rax), %ymm0 + vmovdqa -14*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd -16*32(%rax), %ymm8, %ymm3 + vpaddd sha256d_8preext2_17(%rip), %ymm4, %ymm7 + vmovdqa %ymm3, 0*32(%rax) + vmovdqa %ymm7, 1*32(%rax) + + 
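+	/* Second-hash message schedule: the block being extended is the padded
+	 * 256-bit digest (eight hash words, 0x80000000, six zero words, bit
+	 * length 0x100), so many W[i] are partly constant.  Only W[18..21] below
+	 * use the generic doubleround macro; the sha256d_8preext2_* constants
+	 * supply the precomputed constant contributions for the remaining,
+	 * specialized rounds. */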
sha256_avx2_extend_doubleround 2 + sha256_avx2_extend_doubleround 4 + + vmovdqa -9*32(%rax), %ymm0 + vpslld $14, %ymm0, %ymm2 + vpsrld $3, %ymm0, %ymm8 + vpsrld $7, %ymm0, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpsrld $11, %ymm1, %ymm1 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpaddd sha256d_8preext2_23(%rip), %ymm0, %ymm4 + vpaddd -10*32(%rax), %ymm8, %ymm0 + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpaddd -1*32(%rax), %ymm0, %ymm0 + vpaddd 0*32(%rax), %ymm4, %ymm4 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 6*32(%rax) + vmovdqa %ymm7, 7*32(%rax) + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd sha256d_8preext2_24(%rip), %ymm3, %ymm3 + vpaddd 1*32(%rax), %ymm3, %ymm3 + vpaddd 2*32(%rax), %ymm7, %ymm7 + vmovdqa %ymm3, 8*32(%rax) + vmovdqa %ymm7, 9*32(%rax) + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd 3*32(%rax), %ymm3, %ymm3 + vpaddd 4*32(%rax), %ymm7, %ymm7 + vmovdqa %ymm3, 10*32(%rax) + vmovdqa %ymm7, 11*32(%rax) + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd 5*32(%rax), %ymm3, %ymm3 + vpaddd 6*32(%rax), %ymm7, %ymm7 + vmovdqa %ymm3, 12*32(%rax) + vmovdqa %ymm7, 13*32(%rax) + + vmovdqa sha256d_8preext2_30(%rip), %ymm0 + vmovdqa 0*32(%rax), %ymm4 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm4, %ymm4 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 + vpsrld $11, %ymm5, %ymm5 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 + vpaddd -1*32(%rax), %ymm4, %ymm4 + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpaddd 7*32(%rax), %ymm0, %ymm0 + vpaddd 8*32(%rax), %ymm4, %ymm4 + vpsrld $7, %ymm3, %ymm1 + 
vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 14*32(%rax) + vmovdqa %ymm7, 15*32(%rax) + + jmp sha256d_ms_8way_avx2_extend_loop2 + +sha256d_ms_8way_avx2_extend_coda2: + sha256_avx2_extend_round 44 + + vmovdqa sha256_8h+0(%rip), %ymm7 + vmovdqa sha256_8h+32(%rip), %ymm5 + vmovdqa sha256_8h+64(%rip), %ymm4 + vmovdqa sha256_8h+96(%rip), %ymm3 + vmovdqa sha256_8h+128(%rip), %ymm0 + vmovdqa sha256_8h+160(%rip), %ymm8 + vmovdqa sha256_8h+192(%rip), %ymm9 + vmovdqa sha256_8h+224(%rip), %ymm10 + + movq %rsp, %rax + leaq sha256_8k(%rip), %rcx + jmp sha256d_ms_8way_avx2_main_loop2 + +.macro sha256_avx2_main_round_red i, r0, r1, r2, r3, r4 + vpaddd 32*\i(%rax), \r0, %ymm6 + vpaddd 32*\i(%rcx), %ymm6, %ymm6 + vpandn \r1, \r3, %ymm1 + vpand \r3, \r2, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + vpslld $7, \r3, %ymm1 + vpsrld $6, \r3, \r0 + vpsrld $5, \r0, %ymm2 + vpxor %ymm1, \r0, \r0 + vpxor %ymm2, \r0, \r0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, \r0, \r0 + vpxor %ymm2, \r0, \r0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, \r0, \r0 + vpaddd \r0, %ymm6, %ymm6 + vpaddd %ymm6, \r4, \r0 +.endm + +sha256d_ms_8way_avx2_finish: + sha256_avx2_main_round_red 57, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4 + sha256_avx2_main_round_red 58, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5 + sha256_avx2_main_round_red 59, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7 + sha256_avx2_main_round_red 60, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3 + + vpaddd sha256_8h+224(%rip), %ymm10, %ymm10 + vmovdqa %ymm10, 224(%rdi) + + movq %rbp, %rsp + popq %rbp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + vmovdqa 0(%rsp), %xmm6 + vmovdqa 16(%rsp), %xmm7 + vmovdqa 32(%rsp), %xmm8 + vmovdqa 48(%rsp), %xmm9 + vmovdqa 64(%rsp), %xmm10 + addq $80, %rsp + popq %rdi +#endif + ret + + + .text + .p2align 6 + .globl sha256_use_8way + .globl _sha256_use_8way +sha256_use_8way: +_sha256_use_8way: + pushq %rbx + + /* Check for AVX and OSXSAVE support */ + movl $1, %eax + cpuid + andl $0x18000000, %ecx + cmpl $0x18000000, %ecx + jne sha256_use_8way_no + /* Check for AVX2 support */ + movl $7, %eax + xorl %ecx, %ecx + cpuid + andl $0x00000020, %ebx + cmpl $0x00000020, %ebx + jne sha256_use_8way_no + /* Check for XMM and YMM state support */ + xorl %ecx, %ecx + xgetbv + andl $0x00000006, %eax + cmpl $0x00000006, %eax + jne sha256_use_8way_no + +sha256_use_8way_yes: + movl $1, %eax + jmp sha256_use_8way_done + +sha256_use_8way_no: + xorl %eax, %eax + +sha256_use_8way_done: + popq %rbx + ret + +#endif /* USE_AVX2 */ + +#endif diff --git a/sha2-x86.S b/sha2-x86.S index e2eb112a9..cedc332c1 100644 --- a/sha2-x86.S +++ b/sha2-x86.S @@ -135,189 +135,11 @@ _sha256_init_4way: ret -.macro sha256_sse2_extend_round i - movdqa (\i-15)*16(%eax), %xmm0 - movdqa %xmm0, %xmm2 - psrld $3, %xmm0 - movdqa %xmm0, %xmm1 - pslld $14, %xmm2 - psrld $4, %xmm1 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - psrld $11, %xmm1 - pslld $11, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - paddd (\i-16)*16(%eax), %xmm0 - paddd (\i-7)*16(%eax), %xmm0 - - movdqa %xmm3, %xmm2 - psrld $10, %xmm3 - pslld $13, %xmm2 - movdqa %xmm3, %xmm1 - psrld $7, %xmm1 - pxor %xmm1, %xmm3 - pxor %xmm2, %xmm3 - 
psrld $2, %xmm1 - pslld $2, %xmm2 - pxor %xmm1, %xmm3 - pxor %xmm2, %xmm3 - paddd %xmm0, %xmm3 - movdqa %xmm3, \i*16(%eax) -.endm - -.macro sha256_sse2_extend_doubleround i - movdqa (\i-15)*16(%eax), %xmm0 - movdqa (\i-14)*16(%eax), %xmm4 - movdqa %xmm0, %xmm2 - movdqa %xmm4, %xmm6 - psrld $3, %xmm0 - psrld $3, %xmm4 - movdqa %xmm0, %xmm1 - movdqa %xmm4, %xmm5 - pslld $14, %xmm2 - pslld $14, %xmm6 - psrld $4, %xmm1 - psrld $4, %xmm5 - pxor %xmm1, %xmm0 - pxor %xmm5, %xmm4 - psrld $11, %xmm1 - psrld $11, %xmm5 - pxor %xmm2, %xmm0 - pxor %xmm6, %xmm4 - pslld $11, %xmm2 - pslld $11, %xmm6 - pxor %xmm1, %xmm0 - pxor %xmm5, %xmm4 - pxor %xmm2, %xmm0 - pxor %xmm6, %xmm4 - - paddd (\i-16)*16(%eax), %xmm0 - paddd (\i-15)*16(%eax), %xmm4 - - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - - paddd (\i-7)*16(%eax), %xmm0 - paddd (\i-6)*16(%eax), %xmm4 - - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - - paddd %xmm0, %xmm3 - paddd %xmm4, %xmm7 - movdqa %xmm3, \i*16(%eax) - movdqa %xmm7, (\i+1)*16(%eax) -.endm - -.macro sha256_sse2_main_round i - movdqa 16*(\i)(%eax), %xmm6 - - movdqa %xmm0, %xmm1 - movdqa 16(%esp), %xmm2 - pandn %xmm2, %xmm1 - paddd 32(%esp), %xmm6 - - movdqa %xmm2, 32(%esp) - movdqa 0(%esp), %xmm2 - movdqa %xmm2, 16(%esp) - - pand %xmm0, %xmm2 - pxor %xmm2, %xmm1 - movdqa %xmm0, 0(%esp) - - paddd %xmm1, %xmm6 - - movdqa %xmm0, %xmm1 - psrld $6, %xmm0 - paddd 16*(\i)+sha256_4k, %xmm6 - movdqa %xmm0, %xmm2 - pslld $7, %xmm1 - psrld $5, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - pslld $14, %xmm1 - psrld $14, %xmm2 - pxor %xmm1, %xmm0 - pslld $5, %xmm1 - pxor %xmm2, %xmm0 - pxor %xmm1, %xmm0 - movdqa %xmm5, %xmm1 - paddd %xmm0, %xmm6 - movdqa %xmm3, %xmm0 - movdqa %xmm4, %xmm3 - movdqa %xmm4, %xmm2 - paddd %xmm6, %xmm0 - pand %xmm5, %xmm2 - pand %xmm7, %xmm1 - pand %xmm7, %xmm4 - pxor %xmm4, %xmm1 - movdqa %xmm5, %xmm4 - movdqa %xmm7, %xmm5 - pxor %xmm2, %xmm1 - paddd %xmm1, %xmm6 - movdqa %xmm7, %xmm2 - psrld $2, %xmm7 - movdqa %xmm7, %xmm1 - pslld $10, %xmm2 - psrld $11, %xmm1 - pxor %xmm2, %xmm7 - pslld $9, %xmm2 - pxor %xmm1, %xmm7 - psrld $9, %xmm1 - pxor %xmm2, %xmm7 - pslld $11, %xmm2 - pxor %xmm1, %xmm7 - pxor %xmm2, %xmm7 - paddd %xmm6, %xmm7 -.endm -.macro sha256_sse2_main_quadround i - sha256_sse2_main_round \i+0 - sha256_sse2_main_round \i+1 - sha256_sse2_main_round \i+2 - sha256_sse2_main_round \i+3 -.endm -.macro p2bswap_esi_esp i - movdqu \i*16(%esi), %xmm0 - movdqu (\i+1)*16(%esi), %xmm2 - pshuflw $0xb1, %xmm0, %xmm0 - pshuflw $0xb1, %xmm2, %xmm2 - pshufhw $0xb1, %xmm0, %xmm0 - pshufhw $0xb1, %xmm2, %xmm2 - movdqa %xmm0, %xmm1 - movdqa %xmm2, %xmm3 - psrlw $8, %xmm1 - psrlw $8, %xmm3 - psllw $8, %xmm0 - psllw $8, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm3, %xmm2 - movdqa %xmm0, (\i+3)*16(%esp) - movdqa %xmm2, (\i+4)*16(%esp) -.endm .text .p2align 5 @@ -373,14 +195,134 @@ _sha256_transform_4way: .p2align 5 sha256_transform_4way_swap: - p2bswap_esi_esp 0 - p2bswap_esi_esp 2 - p2bswap_esi_esp 4 - p2bswap_esi_esp 6 - p2bswap_esi_esp 8 - p2bswap_esi_esp 10 - p2bswap_esi_esp 12 - p2bswap_esi_esp 14 + movdqu 0*16(%esi), %xmm0 + movdqu (0+1)*16(%esi), %xmm2 + pshuflw $0xb1, %xmm0, %xmm0 + pshuflw $0xb1, %xmm2, %xmm2 + pshufhw $0xb1, %xmm0, %xmm0 + 
pshufhw $0xb1, %xmm2, %xmm2 + movdqa %xmm0, %xmm1 + movdqa %xmm2, %xmm3 + psrlw $8, %xmm1 + psrlw $8, %xmm3 + psllw $8, %xmm0 + psllw $8, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm3, %xmm2 + movdqa %xmm0, (0+3)*16(%esp) + movdqa %xmm2, (0+4)*16(%esp) + movdqu 2*16(%esi), %xmm0 + movdqu (2+1)*16(%esi), %xmm2 + pshuflw $0xb1, %xmm0, %xmm0 + pshuflw $0xb1, %xmm2, %xmm2 + pshufhw $0xb1, %xmm0, %xmm0 + pshufhw $0xb1, %xmm2, %xmm2 + movdqa %xmm0, %xmm1 + movdqa %xmm2, %xmm3 + psrlw $8, %xmm1 + psrlw $8, %xmm3 + psllw $8, %xmm0 + psllw $8, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm3, %xmm2 + movdqa %xmm0, (2+3)*16(%esp) + movdqa %xmm2, (2+4)*16(%esp) + movdqu 4*16(%esi), %xmm0 + movdqu (4+1)*16(%esi), %xmm2 + pshuflw $0xb1, %xmm0, %xmm0 + pshuflw $0xb1, %xmm2, %xmm2 + pshufhw $0xb1, %xmm0, %xmm0 + pshufhw $0xb1, %xmm2, %xmm2 + movdqa %xmm0, %xmm1 + movdqa %xmm2, %xmm3 + psrlw $8, %xmm1 + psrlw $8, %xmm3 + psllw $8, %xmm0 + psllw $8, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm3, %xmm2 + movdqa %xmm0, (4+3)*16(%esp) + movdqa %xmm2, (4+4)*16(%esp) + movdqu 6*16(%esi), %xmm0 + movdqu (6+1)*16(%esi), %xmm2 + pshuflw $0xb1, %xmm0, %xmm0 + pshuflw $0xb1, %xmm2, %xmm2 + pshufhw $0xb1, %xmm0, %xmm0 + pshufhw $0xb1, %xmm2, %xmm2 + movdqa %xmm0, %xmm1 + movdqa %xmm2, %xmm3 + psrlw $8, %xmm1 + psrlw $8, %xmm3 + psllw $8, %xmm0 + psllw $8, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm3, %xmm2 + movdqa %xmm0, (6+3)*16(%esp) + movdqa %xmm2, (6+4)*16(%esp) + movdqu 8*16(%esi), %xmm0 + movdqu (8+1)*16(%esi), %xmm2 + pshuflw $0xb1, %xmm0, %xmm0 + pshuflw $0xb1, %xmm2, %xmm2 + pshufhw $0xb1, %xmm0, %xmm0 + pshufhw $0xb1, %xmm2, %xmm2 + movdqa %xmm0, %xmm1 + movdqa %xmm2, %xmm3 + psrlw $8, %xmm1 + psrlw $8, %xmm3 + psllw $8, %xmm0 + psllw $8, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm3, %xmm2 + movdqa %xmm0, (8+3)*16(%esp) + movdqa %xmm2, (8+4)*16(%esp) + movdqu 10*16(%esi), %xmm0 + movdqu (10+1)*16(%esi), %xmm2 + pshuflw $0xb1, %xmm0, %xmm0 + pshuflw $0xb1, %xmm2, %xmm2 + pshufhw $0xb1, %xmm0, %xmm0 + pshufhw $0xb1, %xmm2, %xmm2 + movdqa %xmm0, %xmm1 + movdqa %xmm2, %xmm3 + psrlw $8, %xmm1 + psrlw $8, %xmm3 + psllw $8, %xmm0 + psllw $8, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm3, %xmm2 + movdqa %xmm0, (10+3)*16(%esp) + movdqa %xmm2, (10+4)*16(%esp) + movdqu 12*16(%esi), %xmm0 + movdqu (12+1)*16(%esi), %xmm2 + pshuflw $0xb1, %xmm0, %xmm0 + pshuflw $0xb1, %xmm2, %xmm2 + pshufhw $0xb1, %xmm0, %xmm0 + pshufhw $0xb1, %xmm2, %xmm2 + movdqa %xmm0, %xmm1 + movdqa %xmm2, %xmm3 + psrlw $8, %xmm1 + psrlw $8, %xmm3 + psllw $8, %xmm0 + psllw $8, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm3, %xmm2 + movdqa %xmm0, (12+3)*16(%esp) + movdqa %xmm2, (12+4)*16(%esp) + movdqu 14*16(%esi), %xmm0 + movdqu (14+1)*16(%esi), %xmm2 + pshuflw $0xb1, %xmm0, %xmm0 + pshuflw $0xb1, %xmm2, %xmm2 + pshufhw $0xb1, %xmm0, %xmm0 + pshufhw $0xb1, %xmm2, %xmm2 + movdqa %xmm0, %xmm1 + movdqa %xmm2, %xmm3 + psrlw $8, %xmm1 + psrlw $8, %xmm3 + psllw $8, %xmm0 + psllw $8, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm3, %xmm2 + movdqa %xmm0, (14+3)*16(%esp) + movdqa %xmm2, (14+4)*16(%esp) sha256_transform_4way_extend: leal 19*16(%esp), %ecx @@ -784,64 +726,4874 @@ sha256d_ms_4way_extend_loop1: movdqa %xmm7, 15*16(%eax) sha256d_ms_4way_extend_loop2: - sha256_sse2_extend_doubleround 16 - sha256_sse2_extend_doubleround 18 - sha256_sse2_extend_doubleround 20 - sha256_sse2_extend_doubleround 22 - sha256_sse2_extend_doubleround 24 - sha256_sse2_extend_doubleround 26 - sha256_sse2_extend_doubleround 28 - sha256_sse2_extend_doubleround 30 - sha256_sse2_extend_doubleround 32 - sha256_sse2_extend_doubleround 34 - 
sha256_sse2_extend_doubleround 36 - sha256_sse2_extend_doubleround 38 - sha256_sse2_extend_doubleround 40 - sha256_sse2_extend_doubleround 42 - jz sha256d_ms_4way_extend_coda2 - sha256_sse2_extend_doubleround 44 - sha256_sse2_extend_doubleround 46 - - movdqa 0(%ecx), %xmm3 - movdqa 16(%ecx), %xmm0 - movdqa 32(%ecx), %xmm1 - movdqa 48(%ecx), %xmm2 - movdqa 64(%ecx), %xmm6 - movdqa 80(%ecx), %xmm7 - movdqa 96(%ecx), %xmm5 - movdqa 112(%ecx), %xmm4 - movdqa %xmm1, 0(%esp) - movdqa %xmm2, 16(%esp) + movdqa (16-15)*16(%eax), %xmm0 + movdqa (16-14)*16(%eax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (16-16)*16(%eax), %xmm0 + paddd (16-15)*16(%eax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (16-7)*16(%eax), %xmm0 + paddd (16-6)*16(%eax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 16*16(%eax) + movdqa %xmm7, (16+1)*16(%eax) + movdqa (18-15)*16(%eax), %xmm0 + movdqa (18-14)*16(%eax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (18-16)*16(%eax), %xmm0 + paddd (18-15)*16(%eax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (18-7)*16(%eax), %xmm0 + paddd (18-6)*16(%eax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 18*16(%eax) + movdqa %xmm7, (18+1)*16(%eax) + movdqa (20-15)*16(%eax), %xmm0 + movdqa (20-14)*16(%eax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (20-16)*16(%eax), %xmm0 + paddd (20-15)*16(%eax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + 
pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (20-7)*16(%eax), %xmm0 + paddd (20-6)*16(%eax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 20*16(%eax) + movdqa %xmm7, (20+1)*16(%eax) + movdqa (22-15)*16(%eax), %xmm0 + movdqa (22-14)*16(%eax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (22-16)*16(%eax), %xmm0 + paddd (22-15)*16(%eax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (22-7)*16(%eax), %xmm0 + paddd (22-6)*16(%eax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 22*16(%eax) + movdqa %xmm7, (22+1)*16(%eax) + movdqa (24-15)*16(%eax), %xmm0 + movdqa (24-14)*16(%eax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (24-16)*16(%eax), %xmm0 + paddd (24-15)*16(%eax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (24-7)*16(%eax), %xmm0 + paddd (24-6)*16(%eax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 24*16(%eax) + movdqa %xmm7, (24+1)*16(%eax) + movdqa (26-15)*16(%eax), %xmm0 + movdqa (26-14)*16(%eax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (26-16)*16(%eax), %xmm0 + paddd (26-15)*16(%eax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld 
$7, %xmm5 + + paddd (26-7)*16(%eax), %xmm0 + paddd (26-6)*16(%eax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 26*16(%eax) + movdqa %xmm7, (26+1)*16(%eax) + movdqa (28-15)*16(%eax), %xmm0 + movdqa (28-14)*16(%eax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (28-16)*16(%eax), %xmm0 + paddd (28-15)*16(%eax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (28-7)*16(%eax), %xmm0 + paddd (28-6)*16(%eax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 28*16(%eax) + movdqa %xmm7, (28+1)*16(%eax) + movdqa (30-15)*16(%eax), %xmm0 + movdqa (30-14)*16(%eax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (30-16)*16(%eax), %xmm0 + paddd (30-15)*16(%eax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (30-7)*16(%eax), %xmm0 + paddd (30-6)*16(%eax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 30*16(%eax) + movdqa %xmm7, (30+1)*16(%eax) + movdqa (32-15)*16(%eax), %xmm0 + movdqa (32-14)*16(%eax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (32-16)*16(%eax), %xmm0 + paddd (32-15)*16(%eax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (32-7)*16(%eax), %xmm0 + 
paddd (32-6)*16(%eax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 32*16(%eax) + movdqa %xmm7, (32+1)*16(%eax) + movdqa (34-15)*16(%eax), %xmm0 + movdqa (34-14)*16(%eax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (34-16)*16(%eax), %xmm0 + paddd (34-15)*16(%eax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (34-7)*16(%eax), %xmm0 + paddd (34-6)*16(%eax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 34*16(%eax) + movdqa %xmm7, (34+1)*16(%eax) + movdqa (36-15)*16(%eax), %xmm0 + movdqa (36-14)*16(%eax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (36-16)*16(%eax), %xmm0 + paddd (36-15)*16(%eax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (36-7)*16(%eax), %xmm0 + paddd (36-6)*16(%eax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 36*16(%eax) + movdqa %xmm7, (36+1)*16(%eax) + movdqa (38-15)*16(%eax), %xmm0 + movdqa (38-14)*16(%eax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (38-16)*16(%eax), %xmm0 + paddd (38-15)*16(%eax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (38-7)*16(%eax), %xmm0 + paddd (38-6)*16(%eax), %xmm4 + + pxor %xmm1, 
%xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 38*16(%eax) + movdqa %xmm7, (38+1)*16(%eax) + movdqa (40-15)*16(%eax), %xmm0 + movdqa (40-14)*16(%eax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (40-16)*16(%eax), %xmm0 + paddd (40-15)*16(%eax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (40-7)*16(%eax), %xmm0 + paddd (40-6)*16(%eax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 40*16(%eax) + movdqa %xmm7, (40+1)*16(%eax) + movdqa (42-15)*16(%eax), %xmm0 + movdqa (42-14)*16(%eax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (42-16)*16(%eax), %xmm0 + paddd (42-15)*16(%eax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (42-7)*16(%eax), %xmm0 + paddd (42-6)*16(%eax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 42*16(%eax) + movdqa %xmm7, (42+1)*16(%eax) + jz sha256d_ms_4way_extend_coda2 + movdqa (44-15)*16(%eax), %xmm0 + movdqa (44-14)*16(%eax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (44-16)*16(%eax), %xmm0 + paddd (44-15)*16(%eax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (44-7)*16(%eax), %xmm0 + paddd (44-6)*16(%eax), %xmm4 + + pxor %xmm1, %xmm3 + 
pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 44*16(%eax) + movdqa %xmm7, (44+1)*16(%eax) + movdqa (46-15)*16(%eax), %xmm0 + movdqa (46-14)*16(%eax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (46-16)*16(%eax), %xmm0 + paddd (46-15)*16(%eax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (46-7)*16(%eax), %xmm0 + paddd (46-6)*16(%eax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 46*16(%eax) + movdqa %xmm7, (46+1)*16(%eax) + + movdqa 0(%ecx), %xmm3 + movdqa 16(%ecx), %xmm0 + movdqa 32(%ecx), %xmm1 + movdqa 48(%ecx), %xmm2 + movdqa 64(%ecx), %xmm6 + movdqa 80(%ecx), %xmm7 + movdqa 96(%ecx), %xmm5 + movdqa 112(%ecx), %xmm4 + movdqa %xmm1, 0(%esp) + movdqa %xmm2, 16(%esp) movdqa %xmm6, 32(%esp) movl %esi, %eax jmp sha256d_ms_4way_main_loop1 sha256d_ms_4way_main_loop2: - sha256_sse2_main_round 0 - sha256_sse2_main_round 1 - sha256_sse2_main_round 2 + movdqa 16*(0)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(0)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(1)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(1)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld 
$14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(2)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(2)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 sha256d_ms_4way_main_loop1: - sha256_sse2_main_round 3 - sha256_sse2_main_quadround 4 - sha256_sse2_main_quadround 8 - sha256_sse2_main_quadround 12 - sha256_sse2_main_quadround 16 - sha256_sse2_main_quadround 20 - sha256_sse2_main_quadround 24 - sha256_sse2_main_quadround 28 - sha256_sse2_main_quadround 32 - sha256_sse2_main_quadround 36 - sha256_sse2_main_quadround 40 - sha256_sse2_main_quadround 44 - sha256_sse2_main_quadround 48 - sha256_sse2_main_quadround 52 - sha256_sse2_main_round 56 + movdqa 16*(3)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(3)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(4+0)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn 
%xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(4+0)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(4+1)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(4+1)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(4+2)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(4+2)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(4+3)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + 
movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(4+3)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(8+0)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(8+0)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(8+1)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(8+1)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(8+2)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) 
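
In the same way, the `sha256_sse2_main_round` and `sha256_sse2_main_quadround` macro calls at the head of the main loop are expanded inline. Each repeated block above is one SHA-256 compression round applied to four lanes: the pandn/pand/pxor group computes Ch(e,f,g), the first shift/xor chain is Sigma1(e), `paddd 16*(i)+sha256_4k` adds the round constant, and the remaining pand/pxor group and final shift/xor chain supply Maj(a,b,c) and Sigma0(a). For reference, a minimal per-lane C sketch of the round (the state array `s` and helper names are illustrative; the assembly keeps part of the state on the stack rather than in an array):

#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, unsigned n)
{
	return (x >> n) | (x << (32 - n));
}

/* One compression round: s[] holds a..h, w_i is the schedule word for this
 * round, k_i the round constant (held once per lane in the 4-way table). */
static inline void sha256_round(uint32_t s[8], uint32_t w_i, uint32_t k_i)
{
	uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
	uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

	uint32_t S1  = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
	uint32_t ch  = (e & f) ^ (~e & g);
	uint32_t t1  = h + S1 + ch + k_i + w_i;

	uint32_t S0  = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
	uint32_t maj = (a & b) ^ (a & c) ^ (b & c);
	uint32_t t2  = S0 + maj;

	s[7] = g;
	s[6] = f;
	s[5] = e;
	s[4] = d + t1;
	s[3] = c;
	s[2] = b;
	s[1] = a;
	s[0] = t1 + t2;
}
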
+ + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(8+2)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(8+3)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(8+3)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(12+0)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(12+0)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(12+1)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 
16*(12+1)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(12+2)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(12+2)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(12+3)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(12+3)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(16+0)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(16+0)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, 
%xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(16+1)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(16+1)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(16+2)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(16+2)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(16+3)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(16+3)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 
+ pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(20+0)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(20+0)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(20+1)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(20+1)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(20+2)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(20+2)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + 
movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(20+3)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(20+3)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(24+0)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(24+0)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(24+1)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(24+1)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, 
%xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(24+2)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(24+2)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(24+3)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(24+3)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(28+0)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(28+0)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, 
%xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(28+1)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(28+1)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(28+2)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(28+2)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(28+3)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(28+3)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, 
%xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(32+0)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(32+0)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(32+1)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(32+1)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(32+2)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(32+2)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, 
%xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(32+3)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(32+3)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(36+0)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(36+0)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(36+1)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(36+1)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, 
%xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(36+2)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(36+2)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(36+3)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(36+3)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(40+0)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(40+0)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + 
pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(40+1)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(40+1)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(40+2)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(40+2)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(40+3)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(40+3)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + 
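For reference only (this note and the sketch below are not part of the patch): each of the inlined SSE2 blocks in this hunk performs one SHA-256 main round — Ch, Maj and the two big-sigma functions built from shifts and XORs — on four independent 32-bit lanes at once. A minimal scalar C sketch of the same round, assuming the standard FIPS 180-4 round update and reusing the Ch/Maj/ROTR/S0/S1 macro forms that appear later in sha2.c (the helper name sha256_round is hypothetical):

#include <stdint.h>

#define ROTR(x, n)   (((x) >> (n)) | ((x) << (32 - (n))))
#define Ch(x, y, z)  (((x) & ((y) ^ (z))) ^ (z))
#define Maj(x, y, z) (((x) & ((y) | (z))) | ((y) & (z)))
#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))

/* One scalar SHA-256 main round on state s[0..7] = a..h, with message
 * word w and round constant k.  The SSE2 code above applies this same
 * update to four hashes in parallel, one per 32-bit lane. */
static inline void sha256_round(uint32_t s[8], uint32_t w, uint32_t k)
{
	uint32_t t1 = s[7] + S1(s[4]) + Ch(s[4], s[5], s[6]) + k + w;
	uint32_t t2 = S0(s[0]) + Maj(s[0], s[1], s[2]);
	s[7] = s[6]; s[6] = s[5]; s[5] = s[4];
	s[4] = s[3] + t1;
	s[3] = s[2]; s[2] = s[1]; s[1] = s[0];
	s[0] = t1 + t2;
}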
movdqa 16*(44+0)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(44+0)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(44+1)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(44+1)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(44+2)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(44+2)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(44+3)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + 
pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(44+3)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(48+0)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(48+0)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(48+1)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(48+1)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(48+2)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 
0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(48+2)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(48+3)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(48+3)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(52+0)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(52+0)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(52+1)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + 
movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(52+1)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(52+2)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(52+2)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(52+3)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(52+3)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(56)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, 
%xmm0 + paddd 16*(56)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 jz sha256d_ms_4way_finish - sha256_sse2_main_round 57 - sha256_sse2_main_round 58 - sha256_sse2_main_round 59 - sha256_sse2_main_quadround 60 + movdqa 16*(57)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(57)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(58)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(58)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(59)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd 
%xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(59)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(60+0)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(60+0)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(60+1)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(60+1)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(60+2)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(60+2)+sha256_4k, 
%xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + movdqa 16*(60+3)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(60+3)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 movdqa 5*16(%esp), %xmm1 movdqa 6*16(%esp), %xmm2 @@ -928,8 +5680,124 @@ sha256d_ms_4way_main_loop1: movdqa %xmm3, 0*16(%eax) movdqa %xmm7, 1*16(%eax) - sha256_sse2_extend_doubleround 2 - sha256_sse2_extend_doubleround 4 + movdqa (2-15)*16(%eax), %xmm0 + movdqa (2-14)*16(%eax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (2-16)*16(%eax), %xmm0 + paddd (2-15)*16(%eax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (2-7)*16(%eax), %xmm0 + paddd (2-6)*16(%eax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 2*16(%eax) + movdqa %xmm7, (2+1)*16(%eax) + movdqa (4-15)*16(%eax), %xmm0 + movdqa (4-14)*16(%eax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, 
%xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (4-16)*16(%eax), %xmm0 + paddd (4-15)*16(%eax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (4-7)*16(%eax), %xmm0 + paddd (4-6)*16(%eax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 4*16(%eax) + movdqa %xmm7, (4+1)*16(%eax) movdqa -9*16(%eax), %xmm0 movdqa sha256d_4preext2_23, %xmm4 @@ -1104,7 +5972,34 @@ sha256d_ms_4way_main_loop1: jmp sha256d_ms_4way_extend_loop2 sha256d_ms_4way_extend_coda2: - sha256_sse2_extend_round 44 + movdqa (44-15)*16(%eax), %xmm0 + movdqa %xmm0, %xmm2 + psrld $3, %xmm0 + movdqa %xmm0, %xmm1 + pslld $14, %xmm2 + psrld $4, %xmm1 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + psrld $11, %xmm1 + pslld $11, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + paddd (44-16)*16(%eax), %xmm0 + paddd (44-7)*16(%eax), %xmm0 + + movdqa %xmm3, %xmm2 + psrld $10, %xmm3 + pslld $13, %xmm2 + movdqa %xmm3, %xmm1 + psrld $7, %xmm1 + pxor %xmm1, %xmm3 + pxor %xmm2, %xmm3 + psrld $2, %xmm1 + pslld $2, %xmm2 + pxor %xmm1, %xmm3 + pxor %xmm2, %xmm3 + paddd %xmm0, %xmm3 + movdqa %xmm3, 44*16(%eax) movdqa sha256_4h+0, %xmm7 movdqa sha256_4h+16, %xmm5 @@ -1121,13 +6016,98 @@ sha256d_ms_4way_extend_coda2: leal 48(%esp), %eax jmp sha256d_ms_4way_main_loop2 -.macro sha256_sse2_main_round_red i, r7 - movdqa 16*(\i)(%eax), %xmm6 - paddd 16*(\i)+sha256_4k, %xmm6 + +sha256d_ms_4way_finish: + movdqa 16*(57)(%eax), %xmm6 + paddd 16*(57)+sha256_4k, %xmm6 paddd 32(%esp), %xmm6 movdqa %xmm0, %xmm1 movdqa 16(%esp), %xmm2 - paddd \r7, %xmm6 + paddd %xmm3, %xmm6 + pandn %xmm2, %xmm1 + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + paddd %xmm1, %xmm6 + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $5, %xmm1 + pxor %xmm1, %xmm0 + paddd %xmm6, %xmm0 + movdqa 16*(58)(%eax), %xmm6 + paddd 16*(58)+sha256_4k, %xmm6 + paddd 32(%esp), %xmm6 + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + paddd %xmm4, %xmm6 + pandn %xmm2, %xmm1 + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + paddd %xmm1, %xmm6 + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $5, %xmm1 + pxor %xmm1, %xmm0 + paddd %xmm6, %xmm0 + movdqa 16*(59)(%eax), %xmm6 + paddd 16*(59)+sha256_4k, %xmm6 + paddd 32(%esp), %xmm6 + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + paddd %xmm5, %xmm6 + pandn %xmm2, %xmm1 + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 
0(%esp) + paddd %xmm1, %xmm6 + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $5, %xmm1 + pxor %xmm1, %xmm0 + paddd %xmm6, %xmm0 + movdqa 16*(60)(%eax), %xmm6 + paddd 16*(60)+sha256_4k, %xmm6 + paddd 32(%esp), %xmm6 + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + paddd %xmm7, %xmm6 pandn %xmm2, %xmm1 movdqa %xmm2, 32(%esp) movdqa 0(%esp), %xmm2 @@ -1150,13 +6130,6 @@ sha256d_ms_4way_extend_coda2: pslld $5, %xmm1 pxor %xmm1, %xmm0 paddd %xmm6, %xmm0 -.endm - -sha256d_ms_4way_finish: - sha256_sse2_main_round_red 57, %xmm3 - sha256_sse2_main_round_red 58, %xmm4 - sha256_sse2_main_round_red 59, %xmm5 - sha256_sse2_main_round_red 60, %xmm7 paddd sha256_4h+112, %xmm0 movdqa %xmm0, 112(%edi) diff --git a/sha2-x86.S.orig b/sha2-x86.S.orig new file mode 100644 index 000000000..e2eb112a9 --- /dev/null +++ b/sha2-x86.S.orig @@ -0,0 +1,1193 @@ +/* + * Copyright 2012 pooler@litecoinpool.org + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. See COPYING for more details. + */ + +#include "cpuminer-config.h" + +#if defined(__linux__) && defined(__ELF__) + .section .note.GNU-stack,"",%progbits +#endif + +#if defined(USE_ASM) && defined(__i386__) + + .data + .p2align 7 +sha256_4h: + .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 + .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 + .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372 + .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a + .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f + .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c + .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab + .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 + + .data + .p2align 7 +sha256_4k: + .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98 + .long 0x71374491, 0x71374491, 0x71374491, 0x71374491 + .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf + .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5 + .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b + .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1 + .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4 + .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5 + .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98 + .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01 + .long 0x243185be, 0x243185be, 0x243185be, 0x243185be + .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3 + .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74 + .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe + .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7 + .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174 + .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1 + .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786 + .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6 + .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc + .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f + .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa + .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc + .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da + .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152 + .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d + .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8 + .long 0xbf597fc7, 
0xbf597fc7, 0xbf597fc7, 0xbf597fc7 + .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3 + .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147 + .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351 + .long 0x14292967, 0x14292967, 0x14292967, 0x14292967 + .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85 + .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138 + .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc + .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13 + .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354 + .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb + .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e + .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85 + .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1 + .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b + .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70 + .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3 + .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819 + .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624 + .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585 + .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070 + .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116 + .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08 + .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c + .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5 + .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3 + .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a + .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f + .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3 + .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee + .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f + .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814 + .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208 + .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa + .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb + .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7 + .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2 + + .data + .p2align 6 +sha256d_4preext2_15: + .long 0x00000100, 0x00000100, 0x00000100, 0x00000100 +sha256d_4preext2_17: + .long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000 +sha256d_4preext2_23: + .long 0x11002000, 0x11002000, 0x11002000, 0x11002000 +sha256d_4preext2_24: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +sha256d_4preext2_30: + .long 0x00400022, 0x00400022, 0x00400022, 0x00400022 + + + .text + .p2align 5 + .globl sha256_init_4way + .globl _sha256_init_4way +sha256_init_4way: +_sha256_init_4way: + movl 4(%esp), %edx + movdqa sha256_4h+0, %xmm0 + movdqa sha256_4h+16, %xmm1 + movdqa sha256_4h+32, %xmm2 + movdqa sha256_4h+48, %xmm3 + movdqu %xmm0, 0(%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, 32(%edx) + movdqu %xmm3, 48(%edx) + movdqa sha256_4h+64, %xmm0 + movdqa sha256_4h+80, %xmm1 + movdqa sha256_4h+96, %xmm2 + movdqa sha256_4h+112, %xmm3 + movdqu %xmm0, 64(%edx) + movdqu %xmm1, 80(%edx) + movdqu %xmm2, 96(%edx) + movdqu %xmm3, 112(%edx) + ret + + +.macro sha256_sse2_extend_round i + movdqa (\i-15)*16(%eax), %xmm0 + movdqa %xmm0, %xmm2 + psrld $3, %xmm0 + movdqa %xmm0, %xmm1 + pslld $14, %xmm2 + psrld $4, %xmm1 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + psrld $11, %xmm1 + pslld $11, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + paddd (\i-16)*16(%eax), %xmm0 + paddd (\i-7)*16(%eax), %xmm0 + + movdqa %xmm3, %xmm2 + psrld $10, %xmm3 + pslld $13, %xmm2 + movdqa %xmm3, %xmm1 + psrld $7, %xmm1 + pxor %xmm1, %xmm3 + pxor %xmm2, %xmm3 + psrld $2, %xmm1 + pslld $2, %xmm2 + pxor %xmm1, %xmm3 + 
pxor %xmm2, %xmm3 + paddd %xmm0, %xmm3 + movdqa %xmm3, \i*16(%eax) +.endm + +.macro sha256_sse2_extend_doubleround i + movdqa (\i-15)*16(%eax), %xmm0 + movdqa (\i-14)*16(%eax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (\i-16)*16(%eax), %xmm0 + paddd (\i-15)*16(%eax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (\i-7)*16(%eax), %xmm0 + paddd (\i-6)*16(%eax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, \i*16(%eax) + movdqa %xmm7, (\i+1)*16(%eax) +.endm + +.macro sha256_sse2_main_round i + movdqa 16*(\i)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(\i)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 +.endm + +.macro sha256_sse2_main_quadround i + sha256_sse2_main_round \i+0 + sha256_sse2_main_round \i+1 + sha256_sse2_main_round \i+2 + sha256_sse2_main_round \i+3 +.endm + + +.macro p2bswap_esi_esp i + movdqu \i*16(%esi), %xmm0 + movdqu (\i+1)*16(%esi), %xmm2 + pshuflw $0xb1, %xmm0, %xmm0 + pshuflw $0xb1, %xmm2, %xmm2 + pshufhw $0xb1, %xmm0, %xmm0 + pshufhw $0xb1, %xmm2, %xmm2 + movdqa %xmm0, %xmm1 + movdqa %xmm2, %xmm3 + psrlw $8, %xmm1 + psrlw $8, %xmm3 + psllw $8, %xmm0 + psllw $8, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm3, %xmm2 + movdqa %xmm0, (\i+3)*16(%esp) + movdqa %xmm2, (\i+4)*16(%esp) +.endm + + .text + .p2align 5 + .globl sha256_transform_4way + .globl _sha256_transform_4way +sha256_transform_4way: +_sha256_transform_4way: + pushl %edi + pushl %esi + movl 12(%esp), %edi + movl 16(%esp), %esi + movl 20(%esp), %ecx + movl %esp, %edx + subl $67*16, %esp + andl $-128, %esp + + testl %ecx, %ecx + jnz sha256_transform_4way_swap + + movdqu 0*16(%esi), %xmm0 + movdqu 1*16(%esi), %xmm1 + movdqu 2*16(%esi), %xmm2 + movdqu 3*16(%esi), %xmm3 + movdqu 
4*16(%esi), %xmm4 + movdqu 5*16(%esi), %xmm5 + movdqu 6*16(%esi), %xmm6 + movdqu 7*16(%esi), %xmm7 + movdqa %xmm0, 3*16(%esp) + movdqa %xmm1, 4*16(%esp) + movdqa %xmm2, 5*16(%esp) + movdqa %xmm3, 6*16(%esp) + movdqa %xmm4, 7*16(%esp) + movdqa %xmm5, 8*16(%esp) + movdqa %xmm6, 9*16(%esp) + movdqa %xmm7, 10*16(%esp) + movdqu 8*16(%esi), %xmm0 + movdqu 9*16(%esi), %xmm1 + movdqu 10*16(%esi), %xmm2 + movdqu 11*16(%esi), %xmm3 + movdqu 12*16(%esi), %xmm4 + movdqu 13*16(%esi), %xmm5 + movdqu 14*16(%esi), %xmm6 + movdqu 15*16(%esi), %xmm7 + movdqa %xmm0, 11*16(%esp) + movdqa %xmm1, 12*16(%esp) + movdqa %xmm2, 13*16(%esp) + movdqa %xmm3, 14*16(%esp) + movdqa %xmm4, 15*16(%esp) + movdqa %xmm5, 16*16(%esp) + movdqa %xmm6, 17*16(%esp) + movdqa %xmm7, 18*16(%esp) + jmp sha256_transform_4way_extend + + .p2align 5 +sha256_transform_4way_swap: + p2bswap_esi_esp 0 + p2bswap_esi_esp 2 + p2bswap_esi_esp 4 + p2bswap_esi_esp 6 + p2bswap_esi_esp 8 + p2bswap_esi_esp 10 + p2bswap_esi_esp 12 + p2bswap_esi_esp 14 + +sha256_transform_4way_extend: + leal 19*16(%esp), %ecx + leal 48*16(%ecx), %eax + movdqa -2*16(%ecx), %xmm3 + movdqa -1*16(%ecx), %xmm7 +sha256_transform_4way_extend_loop: + movdqa -15*16(%ecx), %xmm0 + movdqa -14*16(%ecx), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd -16*16(%ecx), %xmm0 + paddd -15*16(%ecx), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd -7*16(%ecx), %xmm0 + paddd -6*16(%ecx), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, (%ecx) + movdqa %xmm7, 16(%ecx) + addl $2*16, %ecx + cmpl %ecx, %eax + jne sha256_transform_4way_extend_loop + + movdqu 0(%edi), %xmm7 + movdqu 16(%edi), %xmm5 + movdqu 32(%edi), %xmm4 + movdqu 48(%edi), %xmm3 + movdqu 64(%edi), %xmm0 + movdqu 80(%edi), %xmm1 + movdqu 96(%edi), %xmm2 + movdqu 112(%edi), %xmm6 + movdqa %xmm1, 0(%esp) + movdqa %xmm2, 16(%esp) + movdqa %xmm6, 32(%esp) + + xorl %eax, %eax +sha256_transform_4way_main_loop: + movdqa 3*16(%esp, %eax), %xmm6 + paddd sha256_4k(%eax), %xmm6 + paddd 32(%esp), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $5, %xmm1 + pxor %xmm1, %xmm0 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + paddd %xmm6, %xmm0 + + movdqa %xmm5, %xmm1 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + pand %xmm5, %xmm2 + pand %xmm7, %xmm4 + pand %xmm7, %xmm1 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, 
%xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pxor %xmm1, %xmm7 + pslld $9, %xmm2 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pxor %xmm1, %xmm7 + pslld $11, %xmm2 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + + addl $16, %eax + cmpl $16*64, %eax + jne sha256_transform_4way_main_loop + + movdqu 0(%edi), %xmm1 + movdqu 16(%edi), %xmm2 + paddd %xmm1, %xmm7 + paddd %xmm2, %xmm5 + movdqu 32(%edi), %xmm1 + movdqu 48(%edi), %xmm2 + paddd %xmm1, %xmm4 + paddd %xmm2, %xmm3 + + movdqu %xmm7, 0(%edi) + movdqu %xmm5, 16(%edi) + movdqu %xmm4, 32(%edi) + movdqu %xmm3, 48(%edi) + + movdqu 64(%edi), %xmm1 + movdqu 80(%edi), %xmm2 + movdqu 96(%edi), %xmm6 + movdqu 112(%edi), %xmm7 + paddd %xmm1, %xmm0 + paddd 0(%esp), %xmm2 + paddd 16(%esp), %xmm6 + paddd 32(%esp), %xmm7 + + movdqu %xmm0, 64(%edi) + movdqu %xmm2, 80(%edi) + movdqu %xmm6, 96(%edi) + movdqu %xmm7, 112(%edi) + + movl %edx, %esp + popl %esi + popl %edi + ret + + + .text + .p2align 5 + .globl sha256d_ms_4way + .globl _sha256d_ms_4way +sha256d_ms_4way: +_sha256d_ms_4way: + pushl %edi + pushl %esi + pushl %ebp + movl 16(%esp), %edi + movl 20(%esp), %esi + movl 24(%esp), %edx + movl 28(%esp), %ecx + movl %esp, %ebp + subl $67*16, %esp + andl $-128, %esp + + leal 256(%esi), %eax + +sha256d_ms_4way_extend_loop1: + movdqa 3*16(%esi), %xmm0 + movdqa 2*16(%eax), %xmm3 + movdqa 3*16(%eax), %xmm7 + movdqa %xmm3, 5*16(%esp) + movdqa %xmm7, 6*16(%esp) + movdqa %xmm0, %xmm2 + paddd %xmm0, %xmm7 + psrld $3, %xmm0 + movdqa %xmm0, %xmm1 + pslld $14, %xmm2 + psrld $4, %xmm1 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + psrld $11, %xmm1 + pslld $11, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + paddd %xmm0, %xmm3 + movdqa %xmm3, 2*16(%eax) + movdqa %xmm7, 3*16(%eax) + + movdqa 4*16(%eax), %xmm0 + movdqa %xmm0, 7*16(%esp) + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + movdqa %xmm3, 4*16(%eax) + movdqa %xmm7, 5*16(%eax) + + movdqa 6*16(%eax), %xmm0 + movdqa 7*16(%eax), %xmm4 + movdqa %xmm0, 9*16(%esp) + movdqa %xmm4, 10*16(%esp) + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 6*16(%eax) + movdqa %xmm7, 7*16(%eax) + + movdqa 8*16(%eax), %xmm0 + movdqa 2*16(%eax), %xmm4 + movdqa %xmm0, 11*16(%esp) + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, 
%xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 8*16(%eax) + movdqa %xmm7, 9*16(%eax) + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd 3*16(%eax), %xmm3 + paddd 4*16(%eax), %xmm7 + movdqa %xmm3, 10*16(%eax) + movdqa %xmm7, 11*16(%eax) + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd 5*16(%eax), %xmm3 + paddd 6*16(%eax), %xmm7 + movdqa %xmm3, 12*16(%eax) + movdqa %xmm7, 13*16(%eax) + + movdqa 14*16(%eax), %xmm0 + movdqa 15*16(%eax), %xmm4 + movdqa %xmm0, 17*16(%esp) + movdqa %xmm4, 18*16(%esp) + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + paddd 7*16(%eax), %xmm0 + paddd 8*16(%eax), %xmm4 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 14*16(%eax) + movdqa %xmm7, 15*16(%eax) + +sha256d_ms_4way_extend_loop2: + sha256_sse2_extend_doubleround 16 + sha256_sse2_extend_doubleround 18 + sha256_sse2_extend_doubleround 20 + sha256_sse2_extend_doubleround 22 + sha256_sse2_extend_doubleround 24 + sha256_sse2_extend_doubleround 26 + sha256_sse2_extend_doubleround 28 + sha256_sse2_extend_doubleround 30 + sha256_sse2_extend_doubleround 32 + sha256_sse2_extend_doubleround 34 + sha256_sse2_extend_doubleround 36 + sha256_sse2_extend_doubleround 38 + sha256_sse2_extend_doubleround 40 + sha256_sse2_extend_doubleround 42 + jz sha256d_ms_4way_extend_coda2 + sha256_sse2_extend_doubleround 44 + sha256_sse2_extend_doubleround 46 + + movdqa 0(%ecx), %xmm3 + movdqa 16(%ecx), %xmm0 + movdqa 32(%ecx), %xmm1 + movdqa 48(%ecx), %xmm2 + movdqa 64(%ecx), %xmm6 + movdqa 80(%ecx), %xmm7 + movdqa 96(%ecx), %xmm5 + movdqa 112(%ecx), %xmm4 + movdqa %xmm1, 0(%esp) + movdqa %xmm2, 16(%esp) + movdqa %xmm6, 32(%esp) + + movl %esi, %eax + jmp sha256d_ms_4way_main_loop1 + +sha256d_ms_4way_main_loop2: + sha256_sse2_main_round 0 + sha256_sse2_main_round 1 + sha256_sse2_main_round 2 +sha256d_ms_4way_main_loop1: + sha256_sse2_main_round 3 + sha256_sse2_main_quadround 4 + sha256_sse2_main_quadround 8 + sha256_sse2_main_quadround 12 + sha256_sse2_main_quadround 16 + sha256_sse2_main_quadround 20 + sha256_sse2_main_quadround 24 + sha256_sse2_main_quadround 28 + sha256_sse2_main_quadround 32 + sha256_sse2_main_quadround 36 + sha256_sse2_main_quadround 40 + sha256_sse2_main_quadround 44 + sha256_sse2_main_quadround 48 + sha256_sse2_main_quadround 52 + sha256_sse2_main_round 56 + jz sha256d_ms_4way_finish + sha256_sse2_main_round 57 + sha256_sse2_main_round 58 + 
sha256_sse2_main_round 59 + sha256_sse2_main_quadround 60 + + movdqa 5*16(%esp), %xmm1 + movdqa 6*16(%esp), %xmm2 + movdqa 7*16(%esp), %xmm6 + movdqa %xmm1, 18*16(%esi) + movdqa %xmm2, 19*16(%esi) + movdqa %xmm6, 20*16(%esi) + movdqa 9*16(%esp), %xmm1 + movdqa 10*16(%esp), %xmm2 + movdqa 11*16(%esp), %xmm6 + movdqa %xmm1, 22*16(%esi) + movdqa %xmm2, 23*16(%esi) + movdqa %xmm6, 24*16(%esi) + movdqa 17*16(%esp), %xmm1 + movdqa 18*16(%esp), %xmm2 + movdqa %xmm1, 30*16(%esi) + movdqa %xmm2, 31*16(%esi) + + movdqa 0(%esp), %xmm1 + movdqa 16(%esp), %xmm2 + movdqa 32(%esp), %xmm6 + paddd 0(%edx), %xmm7 + paddd 16(%edx), %xmm5 + paddd 32(%edx), %xmm4 + paddd 48(%edx), %xmm3 + paddd 64(%edx), %xmm0 + paddd 80(%edx), %xmm1 + paddd 96(%edx), %xmm2 + paddd 112(%edx), %xmm6 + + movdqa %xmm7, 48+0(%esp) + movdqa %xmm5, 48+16(%esp) + movdqa %xmm4, 48+32(%esp) + movdqa %xmm3, 48+48(%esp) + movdqa %xmm0, 48+64(%esp) + movdqa %xmm1, 48+80(%esp) + movdqa %xmm2, 48+96(%esp) + movdqa %xmm6, 48+112(%esp) + + movdqa sha256d_4preext2_15, %xmm1 + movdqa sha256d_4preext2_24, %xmm2 + pxor %xmm0, %xmm0 + movdqa %xmm2, 48+128(%esp) + movdqa %xmm0, 48+144(%esp) + movdqa %xmm0, 48+160(%esp) + movdqa %xmm0, 48+176(%esp) + movdqa %xmm0, 48+192(%esp) + movdqa %xmm0, 48+208(%esp) + movdqa %xmm0, 48+224(%esp) + movdqa %xmm1, 48+240(%esp) + + leal 19*16(%esp), %eax + cmpl %eax, %eax + + movdqa -15*16(%eax), %xmm0 + movdqa -14*16(%eax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + paddd -16*16(%eax), %xmm0 + paddd -15*16(%eax), %xmm4 + paddd sha256d_4preext2_17, %xmm4 + movdqa %xmm0, %xmm3 + movdqa %xmm4, %xmm7 + movdqa %xmm3, 0*16(%eax) + movdqa %xmm7, 1*16(%eax) + + sha256_sse2_extend_doubleround 2 + sha256_sse2_extend_doubleround 4 + + movdqa -9*16(%eax), %xmm0 + movdqa sha256d_4preext2_23, %xmm4 + movdqa %xmm0, %xmm2 + psrld $3, %xmm0 + movdqa %xmm0, %xmm1 + pslld $14, %xmm2 + psrld $4, %xmm1 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + psrld $11, %xmm1 + pslld $11, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + paddd -10*16(%eax), %xmm0 + paddd -9*16(%eax), %xmm4 + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + paddd -1*16(%eax), %xmm0 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + paddd 0*16(%eax), %xmm4 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 6*16(%eax) + movdqa %xmm7, 7*16(%eax) + + movdqa sha256d_4preext2_24, %xmm0 + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + paddd 1*16(%eax), %xmm0 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, 
%xmm3 + paddd 2*16(%eax), %xmm7 + movdqa %xmm3, 8*16(%eax) + movdqa %xmm7, 9*16(%eax) + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd 3*16(%eax), %xmm3 + paddd 4*16(%eax), %xmm7 + movdqa %xmm3, 10*16(%eax) + movdqa %xmm7, 11*16(%eax) + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd 5*16(%eax), %xmm3 + paddd 6*16(%eax), %xmm7 + movdqa %xmm3, 12*16(%eax) + movdqa %xmm7, 13*16(%eax) + + movdqa sha256d_4preext2_30, %xmm0 + movdqa 0*16(%eax), %xmm4 + movdqa %xmm4, %xmm6 + psrld $3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $14, %xmm6 + psrld $4, %xmm5 + pxor %xmm5, %xmm4 + pxor %xmm6, %xmm4 + psrld $11, %xmm5 + pslld $11, %xmm6 + pxor %xmm5, %xmm4 + pxor %xmm6, %xmm4 + paddd -1*16(%eax), %xmm4 + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + paddd 7*16(%eax), %xmm0 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + paddd 8*16(%eax), %xmm4 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 14*16(%eax) + movdqa %xmm7, 15*16(%eax) + + jmp sha256d_ms_4way_extend_loop2 + +sha256d_ms_4way_extend_coda2: + sha256_sse2_extend_round 44 + + movdqa sha256_4h+0, %xmm7 + movdqa sha256_4h+16, %xmm5 + movdqa sha256_4h+32, %xmm4 + movdqa sha256_4h+48, %xmm3 + movdqa sha256_4h+64, %xmm0 + movdqa sha256_4h+80, %xmm1 + movdqa sha256_4h+96, %xmm2 + movdqa sha256_4h+112, %xmm6 + movdqa %xmm1, 0(%esp) + movdqa %xmm2, 16(%esp) + movdqa %xmm6, 32(%esp) + + leal 48(%esp), %eax + jmp sha256d_ms_4way_main_loop2 + +.macro sha256_sse2_main_round_red i, r7 + movdqa 16*(\i)(%eax), %xmm6 + paddd 16*(\i)+sha256_4k, %xmm6 + paddd 32(%esp), %xmm6 + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + paddd \r7, %xmm6 + pandn %xmm2, %xmm1 + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + paddd %xmm1, %xmm6 + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $5, %xmm1 + pxor %xmm1, %xmm0 + paddd %xmm6, %xmm0 +.endm + +sha256d_ms_4way_finish: + sha256_sse2_main_round_red 57, %xmm3 + sha256_sse2_main_round_red 58, %xmm4 + sha256_sse2_main_round_red 59, %xmm5 + sha256_sse2_main_round_red 60, %xmm7 + + paddd sha256_4h+112, %xmm0 + movdqa %xmm0, 112(%edi) + + movl %ebp, %esp + popl %ebp + popl %esi + popl %edi + ret + + + .text + .p2align 5 + .globl sha256_use_4way + .globl 
_sha256_use_4way +sha256_use_4way: +_sha256_use_4way: + pushl %ebx + + /* Check for SSE2 availability */ + movl $1, %eax + cpuid + andl $0x04000000, %edx + jnz sha256_use_4way_sse2 + xorl %eax, %eax + popl %ebx + ret + +sha256_use_4way_sse2: + movl $1, %eax + popl %ebx + ret + +#endif diff --git a/sha2.c b/sha2.c index 9447abb45..0a1070bd8 100644 --- a/sha2.c +++ b/sha2.c @@ -14,36 +14,34 @@ #include #include -#if defined(USE_ASM) && \ - (defined(__x86_64__) || \ - (defined(__arm__) && defined(__APCS_32__)) || \ - (defined(__powerpc__) || defined(__ppc__) || defined(__PPC__))) +#if defined(USE_ASM) && \ + (defined(__x86_64__) || \ + (defined(__arm__) && defined(__APCS_32__)) || \ + (defined(__powerpc__) || defined(__ppc__) || defined(__PPC__))) #define EXTERN_SHA256 #endif static const uint32_t sha256_h[8] = { - 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, - 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 -}; + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, + 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19}; static const uint32_t sha256_k[64] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, - 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, - 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, - 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, - 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, - 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, - 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, - 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, - 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 -}; + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2}; void sha256_init(uint32_t *state) { @@ -51,30 +49,31 @@ void sha256_init(uint32_t *state) } /* Elementary functions used by SHA256 */ -#define Ch(x, y, z) ((x & (y ^ z)) ^ z) -#define Maj(x, y, z) ((x & (y | z)) | (y & z)) -#define ROTR(x, n) ((x >> n) | (x << (32 - n))) -#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) -#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) -#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3)) -#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10)) +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define ROTR(x, n) ((x >> n) | (x << (32 - n))) +#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) +#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) +#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3)) +#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10)) /* SHA256 round function */ #define 
RND(a, b, c, d, e, f, g, h, k) \ - do { \ - t0 = h + S1(e) + Ch(e, f, g) + k; \ - t1 = S0(a) + Maj(a, b, c); \ - d += t0; \ - h = t0 + t1; \ + do \ + { \ + t0 = h + S1(e) + Ch(e, f, g) + k; \ + t1 = S0(a) + Maj(a, b, c); \ + d += t0; \ + h = t0 + t1; \ } while (0) /* Adjusted round function for rotating state */ -#define RNDr(S, W, i) \ +#define RNDr(S, W, i) \ RND(S[(64 - i) % 8], S[(65 - i) % 8], \ - S[(66 - i) % 8], S[(67 - i) % 8], \ - S[(68 - i) % 8], S[(69 - i) % 8], \ - S[(70 - i) % 8], S[(71 - i) % 8], \ - W[i] + sha256_k[i]) + S[(66 - i) % 8], S[(67 - i) % 8], \ + S[(68 - i) % 8], S[(69 - i) % 8], \ + S[(70 - i) % 8], S[(71 - i) % 8], \ + W[i] + sha256_k[i]) #ifndef EXTERN_SHA256 @@ -90,30 +89,33 @@ void sha256_transform(uint32_t *state, const uint32_t *block, int swap) int i; /* 1. Prepare message schedule W. */ - if (swap) { + if (swap) + { for (i = 0; i < 16; i++) W[i] = swab32(block[i]); - } else + } + else memcpy(W, block, 64); - for (i = 16; i < 64; i += 2) { - W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; - W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; + for (i = 16; i < 64; i += 2) + { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i + 1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } /* 2. Initialize working variables. */ memcpy(S, state, 32); /* 3. Mix. */ - RNDr(S, W, 0); - RNDr(S, W, 1); - RNDr(S, W, 2); - RNDr(S, W, 3); - RNDr(S, W, 4); - RNDr(S, W, 5); - RNDr(S, W, 6); - RNDr(S, W, 7); - RNDr(S, W, 8); - RNDr(S, W, 9); + RNDr(S, W, 0); + RNDr(S, W, 1); + RNDr(S, W, 2); + RNDr(S, W, 3); + RNDr(S, W, 4); + RNDr(S, W, 5); + RNDr(S, W, 6); + RNDr(S, W, 7); + RNDr(S, W, 8); + RNDr(S, W, 9); RNDr(S, W, 10); RNDr(S, W, 11); RNDr(S, W, 12); @@ -176,13 +178,11 @@ void sha256_transform(uint32_t *state, const uint32_t *block, int swap) #endif /* EXTERN_SHA256 */ - static const uint32_t sha256d_hash1[16] = { - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x80000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000100 -}; + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x80000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000100}; static void sha256d_80_swap(uint32_t *hash, const uint32_t *data) { @@ -205,7 +205,8 @@ void sha256d(unsigned char *hash, const unsigned char *data, int len) int i, r; sha256_init(S); - for (r = len; r > -9; r -= 64) { + for (r = len; r > -9; r -= 64) + { if (r < 64) memset(T, 0, 64); memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 
0 : r)); @@ -226,22 +227,22 @@ void sha256d(unsigned char *hash, const unsigned char *data, int len) static inline void sha256d_preextend(uint32_t *W) { - W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0]; - W[17] = s1(W[15]) + W[10] + s0(W[ 2]) + W[ 1]; - W[18] = s1(W[16]) + W[11] + W[ 2]; - W[19] = s1(W[17]) + W[12] + s0(W[ 4]); - W[20] = W[13] + s0(W[ 5]) + W[ 4]; - W[21] = W[14] + s0(W[ 6]) + W[ 5]; - W[22] = W[15] + s0(W[ 7]) + W[ 6]; - W[23] = W[16] + s0(W[ 8]) + W[ 7]; - W[24] = W[17] + s0(W[ 9]) + W[ 8]; - W[25] = s0(W[10]) + W[ 9]; - W[26] = s0(W[11]) + W[10]; - W[27] = s0(W[12]) + W[11]; - W[28] = s0(W[13]) + W[12]; - W[29] = s0(W[14]) + W[13]; - W[30] = s0(W[15]) + W[14]; - W[31] = s0(W[16]) + W[15]; + W[16] = s1(W[14]) + W[9] + s0(W[1]) + W[0]; + W[17] = s1(W[15]) + W[10] + s0(W[2]) + W[1]; + W[18] = s1(W[16]) + W[11] + W[2]; + W[19] = s1(W[17]) + W[12] + s0(W[4]); + W[20] = W[13] + s0(W[5]) + W[4]; + W[21] = W[14] + s0(W[6]) + W[5]; + W[22] = W[15] + s0(W[7]) + W[6]; + W[23] = W[16] + s0(W[8]) + W[7]; + W[24] = W[17] + s0(W[9]) + W[8]; + W[25] = s0(W[10]) + W[9]; + W[26] = s0(W[11]) + W[10]; + W[27] = s0(W[12]) + W[11]; + W[28] = s0(W[13]) + W[12]; + W[29] = s0(W[14]) + W[13]; + W[30] = s0(W[15]) + W[14]; + W[31] = s0(W[16]) + W[15]; } static inline void sha256d_prehash(uint32_t *S, const uint32_t *W) @@ -255,12 +256,12 @@ static inline void sha256d_prehash(uint32_t *S, const uint32_t *W) #ifdef EXTERN_SHA256 void sha256d_ms(uint32_t *hash, uint32_t *W, - const uint32_t *midstate, const uint32_t *prehash); + const uint32_t *midstate, const uint32_t *prehash); #else static inline void sha256d_ms(uint32_t *hash, uint32_t *W, - const uint32_t *midstate, const uint32_t *prehash) + const uint32_t *midstate, const uint32_t *prehash) { uint32_t S[64]; uint32_t t0, t1; @@ -278,31 +279,32 @@ static inline void sha256d_ms(uint32_t *hash, uint32_t *W, W[18] += s0(W[3]); W[19] += W[3]; W[20] += s1(W[18]); - W[21] = s1(W[19]); + W[21] = s1(W[19]); W[22] += s1(W[20]); W[23] += s1(W[21]); W[24] += s1(W[22]); - W[25] = s1(W[23]) + W[18]; - W[26] = s1(W[24]) + W[19]; - W[27] = s1(W[25]) + W[20]; - W[28] = s1(W[26]) + W[21]; - W[29] = s1(W[27]) + W[22]; + W[25] = s1(W[23]) + W[18]; + W[26] = s1(W[24]) + W[19]; + W[27] = s1(W[25]) + W[20]; + W[28] = s1(W[26]) + W[21]; + W[29] = s1(W[27]) + W[22]; W[30] += s1(W[28]) + W[23]; W[31] += s1(W[29]) + W[24]; - for (i = 32; i < 64; i += 2) { - W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; - W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; + for (i = 32; i < 64; i += 2) + { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i + 1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } memcpy(S, prehash, 32); - RNDr(S, W, 3); - RNDr(S, W, 4); - RNDr(S, W, 5); - RNDr(S, W, 6); - RNDr(S, W, 7); - RNDr(S, W, 8); - RNDr(S, W, 9); + RNDr(S, W, 3); + RNDr(S, W, 4); + RNDr(S, W, 5); + RNDr(S, W, 6); + RNDr(S, W, 7); + RNDr(S, W, 8); + RNDr(S, W, 9); RNDr(S, W, 10); RNDr(S, W, 11); RNDr(S, W, 12); @@ -360,7 +362,7 @@ static inline void sha256d_ms(uint32_t *hash, uint32_t *W, for (i = 0; i < 8; i++) S[i] += midstate[i]; - + W[18] = S[18]; W[19] = S[19]; W[20] = S[20]; @@ -369,42 +371,43 @@ static inline void sha256d_ms(uint32_t *hash, uint32_t *W, W[24] = S[24]; W[30] = S[30]; W[31] = S[31]; - + memcpy(S + 8, sha256d_hash1 + 8, 32); - S[16] = s1(sha256d_hash1[14]) + sha256d_hash1[ 9] + s0(S[ 1]) + S[ 0]; - S[17] = s1(sha256d_hash1[15]) + sha256d_hash1[10] + s0(S[ 2]) + S[ 1]; - S[18] = s1(S[16]) + sha256d_hash1[11] + s0(S[ 
3]) + S[ 2]; - S[19] = s1(S[17]) + sha256d_hash1[12] + s0(S[ 4]) + S[ 3]; - S[20] = s1(S[18]) + sha256d_hash1[13] + s0(S[ 5]) + S[ 4]; - S[21] = s1(S[19]) + sha256d_hash1[14] + s0(S[ 6]) + S[ 5]; - S[22] = s1(S[20]) + sha256d_hash1[15] + s0(S[ 7]) + S[ 6]; - S[23] = s1(S[21]) + S[16] + s0(sha256d_hash1[ 8]) + S[ 7]; - S[24] = s1(S[22]) + S[17] + s0(sha256d_hash1[ 9]) + sha256d_hash1[ 8]; - S[25] = s1(S[23]) + S[18] + s0(sha256d_hash1[10]) + sha256d_hash1[ 9]; + S[16] = s1(sha256d_hash1[14]) + sha256d_hash1[9] + s0(S[1]) + S[0]; + S[17] = s1(sha256d_hash1[15]) + sha256d_hash1[10] + s0(S[2]) + S[1]; + S[18] = s1(S[16]) + sha256d_hash1[11] + s0(S[3]) + S[2]; + S[19] = s1(S[17]) + sha256d_hash1[12] + s0(S[4]) + S[3]; + S[20] = s1(S[18]) + sha256d_hash1[13] + s0(S[5]) + S[4]; + S[21] = s1(S[19]) + sha256d_hash1[14] + s0(S[6]) + S[5]; + S[22] = s1(S[20]) + sha256d_hash1[15] + s0(S[7]) + S[6]; + S[23] = s1(S[21]) + S[16] + s0(sha256d_hash1[8]) + S[7]; + S[24] = s1(S[22]) + S[17] + s0(sha256d_hash1[9]) + sha256d_hash1[8]; + S[25] = s1(S[23]) + S[18] + s0(sha256d_hash1[10]) + sha256d_hash1[9]; S[26] = s1(S[24]) + S[19] + s0(sha256d_hash1[11]) + sha256d_hash1[10]; S[27] = s1(S[25]) + S[20] + s0(sha256d_hash1[12]) + sha256d_hash1[11]; S[28] = s1(S[26]) + S[21] + s0(sha256d_hash1[13]) + sha256d_hash1[12]; S[29] = s1(S[27]) + S[22] + s0(sha256d_hash1[14]) + sha256d_hash1[13]; S[30] = s1(S[28]) + S[23] + s0(sha256d_hash1[15]) + sha256d_hash1[14]; - S[31] = s1(S[29]) + S[24] + s0(S[16]) + sha256d_hash1[15]; - for (i = 32; i < 60; i += 2) { - S[i] = s1(S[i - 2]) + S[i - 7] + s0(S[i - 15]) + S[i - 16]; - S[i+1] = s1(S[i - 1]) + S[i - 6] + s0(S[i - 14]) + S[i - 15]; + S[31] = s1(S[29]) + S[24] + s0(S[16]) + sha256d_hash1[15]; + for (i = 32; i < 60; i += 2) + { + S[i] = s1(S[i - 2]) + S[i - 7] + s0(S[i - 15]) + S[i - 16]; + S[i + 1] = s1(S[i - 1]) + S[i - 6] + s0(S[i - 14]) + S[i - 15]; } S[60] = s1(S[58]) + S[53] + s0(S[45]) + S[44]; sha256_init(hash); - RNDr(hash, S, 0); - RNDr(hash, S, 1); - RNDr(hash, S, 2); - RNDr(hash, S, 3); - RNDr(hash, S, 4); - RNDr(hash, S, 5); - RNDr(hash, S, 6); - RNDr(hash, S, 7); - RNDr(hash, S, 8); - RNDr(hash, S, 9); + RNDr(hash, S, 0); + RNDr(hash, S, 1); + RNDr(hash, S, 2); + RNDr(hash, S, 3); + RNDr(hash, S, 4); + RNDr(hash, S, 5); + RNDr(hash, S, 6); + RNDr(hash, S, 7); + RNDr(hash, S, 8); + RNDr(hash, S, 9); RNDr(hash, S, 10); RNDr(hash, S, 11); RNDr(hash, S, 12); @@ -452,27 +455,22 @@ static inline void sha256d_ms(uint32_t *hash, uint32_t *W, RNDr(hash, S, 54); RNDr(hash, S, 55); RNDr(hash, S, 56); - - hash[2] += hash[6] + S1(hash[3]) + Ch(hash[3], hash[4], hash[5]) - + S[57] + sha256_k[57]; - hash[1] += hash[5] + S1(hash[2]) + Ch(hash[2], hash[3], hash[4]) - + S[58] + sha256_k[58]; - hash[0] += hash[4] + S1(hash[1]) + Ch(hash[1], hash[2], hash[3]) - + S[59] + sha256_k[59]; - hash[7] += hash[3] + S1(hash[0]) + Ch(hash[0], hash[1], hash[2]) - + S[60] + sha256_k[60] - + sha256_h[7]; + + hash[2] += hash[6] + S1(hash[3]) + Ch(hash[3], hash[4], hash[5]) + S[57] + sha256_k[57]; + hash[1] += hash[5] + S1(hash[2]) + Ch(hash[2], hash[3], hash[4]) + S[58] + sha256_k[58]; + hash[0] += hash[4] + S1(hash[1]) + Ch(hash[1], hash[2], hash[3]) + S[59] + sha256_k[59]; + hash[7] += hash[3] + S1(hash[0]) + Ch(hash[0], hash[1], hash[2]) + S[60] + sha256_k[60] + sha256_h[7]; } #endif /* EXTERN_SHA256 */ #ifdef HAVE_SHA256_4WAY -void sha256d_ms_4way(uint32_t *hash, uint32_t *data, - const uint32_t *midstate, const uint32_t *prehash); +void sha256d_ms_4way(uint32_t *hash, uint32_t *data, + 
const uint32_t *midstate, const uint32_t *prehash); static inline int scanhash_sha256d_4way(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) + const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) { uint32_t data[4 * 64] __attribute__((aligned(128))); uint32_t hash[4 * 8] __attribute__((aligned(32))); @@ -482,42 +480,48 @@ static inline int scanhash_sha256d_4way(int thr_id, uint32_t *pdata, const uint32_t first_nonce = pdata[19]; const uint32_t Htarg = ptarget[7]; int i, j; - + memcpy(data, pdata + 16, 64); sha256d_preextend(data); for (i = 31; i >= 0; i--) for (j = 0; j < 4; j++) data[i * 4 + j] = data[i]; - + sha256_init(midstate); sha256_transform(midstate, pdata, 0); memcpy(prehash, midstate, 32); sha256d_prehash(prehash, pdata + 16); - for (i = 7; i >= 0; i--) { - for (j = 0; j < 4; j++) { + for (i = 7; i >= 0; i--) + { + for (j = 0; j < 4; j++) + { midstate[i * 4 + j] = midstate[i]; prehash[i * 4 + j] = prehash[i]; } } - - do { + + do + { for (i = 0; i < 4; i++) data[4 * 3 + i] = ++n; - + sha256d_ms_4way(hash, data, midstate, prehash); - - for (i = 0; i < 4; i++) { - if (swab32(hash[4 * 7 + i]) <= Htarg) { + + for (i = 0; i < 4; i++) + { + if (swab32(hash[4 * 7 + i]) <= Htarg) + { pdata[19] = data[4 * 3 + i]; sha256d_80_swap(hash, pdata); - if (fulltest(hash, ptarget)) { + if (fulltest(hash, ptarget)) + { *hashes_done = n - first_nonce + 1; return 1; } } } } while (n < max_nonce && !work_restart[thr_id].restart); - + *hashes_done = n - first_nonce + 1; pdata[19] = n; return 0; @@ -527,11 +531,11 @@ static inline int scanhash_sha256d_4way(int thr_id, uint32_t *pdata, #ifdef HAVE_SHA256_8WAY -void sha256d_ms_8way(uint32_t *hash, uint32_t *data, - const uint32_t *midstate, const uint32_t *prehash); +void sha256d_ms_8way(uint32_t *hash, uint32_t *data, + const uint32_t *midstate, const uint32_t *prehash); static inline int scanhash_sha256d_8way(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) + const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) { uint32_t data[8 * 64] __attribute__((aligned(128))); uint32_t hash[8 * 8] __attribute__((aligned(32))); @@ -541,42 +545,48 @@ static inline int scanhash_sha256d_8way(int thr_id, uint32_t *pdata, const uint32_t first_nonce = pdata[19]; const uint32_t Htarg = ptarget[7]; int i, j; - + memcpy(data, pdata + 16, 64); sha256d_preextend(data); for (i = 31; i >= 0; i--) for (j = 0; j < 8; j++) data[i * 8 + j] = data[i]; - + sha256_init(midstate); sha256_transform(midstate, pdata, 0); memcpy(prehash, midstate, 32); sha256d_prehash(prehash, pdata + 16); - for (i = 7; i >= 0; i--) { - for (j = 0; j < 8; j++) { + for (i = 7; i >= 0; i--) + { + for (j = 0; j < 8; j++) + { midstate[i * 8 + j] = midstate[i]; prehash[i * 8 + j] = prehash[i]; } } - - do { + + do + { for (i = 0; i < 8; i++) data[8 * 3 + i] = ++n; - + sha256d_ms_8way(hash, data, midstate, prehash); - - for (i = 0; i < 8; i++) { - if (swab32(hash[8 * 7 + i]) <= Htarg) { + + for (i = 0; i < 8; i++) + { + if (swab32(hash[8 * 7 + i]) <= Htarg) + { pdata[19] = data[8 * 3 + i]; sha256d_80_swap(hash, pdata); - if (fulltest(hash, ptarget)) { + if (fulltest(hash, ptarget)) + { *hashes_done = n - first_nonce + 1; return 1; } } } } while (n < max_nonce && !work_restart[thr_id].restart); - + *hashes_done = n - first_nonce + 1; pdata[19] = n; return 0; @@ -585,7 +595,7 @@ static inline int scanhash_sha256d_8way(int thr_id, uint32_t *pdata, #endif /* 
HAVE_SHA256_8WAY */ int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) + uint32_t max_nonce, unsigned long *hashes_done) { uint32_t data[64] __attribute__((aligned(128))); uint32_t hash[8] __attribute__((aligned(32))); @@ -594,40 +604,133 @@ int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t n = pdata[19] - 1; const uint32_t first_nonce = pdata[19]; const uint32_t Htarg = ptarget[7]; - + #ifdef HAVE_SHA256_8WAY if (sha256_use_8way()) return scanhash_sha256d_8way(thr_id, pdata, ptarget, - max_nonce, hashes_done); + max_nonce, hashes_done); #endif #ifdef HAVE_SHA256_4WAY if (sha256_use_4way()) return scanhash_sha256d_4way(thr_id, pdata, ptarget, - max_nonce, hashes_done); + max_nonce, hashes_done); #endif - + memcpy(data, pdata + 16, 64); sha256d_preextend(data); - + sha256_init(midstate); sha256_transform(midstate, pdata, 0); memcpy(prehash, midstate, 32); sha256d_prehash(prehash, pdata + 16); - - do { + + do + { data[3] = ++n; sha256d_ms(hash, data, midstate, prehash); - if (swab32(hash[7]) <= Htarg) { + if (swab32(hash[7]) <= Htarg) + { pdata[19] = data[3]; sha256d_80_swap(hash, pdata); - if (fulltest(hash, ptarget)) { + if (fulltest(hash, ptarget)) + { *hashes_done = n - first_nonce + 1; return 1; } } } while (n < max_nonce && !work_restart[thr_id].restart); - + *hashes_done = n - first_nonce + 1; pdata[19] = n; return 0; } + +int scanhash_sha256(int thr_id, uint32_t *pdata, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t data[64] __attribute__((aligned(128))); + uint32_t hash[8] __attribute__((aligned(32))); + uint32_t n = pdata[19] - 1; + const uint32_t first_nonce = pdata[19]; + const uint32_t Htarg = ptarget[7]; + +#ifdef HAVE_SHA256_8WAY + if (sha256_use_8way()) + return scanhash_sha256d_8way(thr_id, pdata, ptarget, + max_nonce, hashes_done); +#endif +#ifdef HAVE_SHA256_4WAY + if (sha256_use_4way()) + return scanhash_sha256d_4way(thr_id, pdata, ptarget, + max_nonce, hashes_done); +#endif + + memcpy(data, pdata, 64); // Prepare the initial data block + + do + { + data[19] = ++n; // Update nonce in the data array + sha256_init(hash); // Initialize SHA256 state + sha256_transform(hash, data, 0); // Perform the single SHA256 + + // Check if the hash is below the target + if (swab32(hash[7]) <= Htarg) + { + pdata[19] = data[19]; // Store the valid nonce + if (fulltest(hash, ptarget)) + { // Verify full PoW target + *hashes_done = n - first_nonce + 1; + return 1; + } + } + } while (n < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = n - first_nonce + 1; + pdata[19] = n; + return 0; +} + +int scanhash_sha256ET10(int thr_id, uint32_t *pdata, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t data[64] __attribute__((aligned(128))); + uint32_t hash[8] __attribute__((aligned(32))); + uint32_t n = pdata[19] - 1; + const uint32_t first_nonce = pdata[19]; + const uint32_t Htarg = ptarget[7] * 10; + +#ifdef HAVE_SHA256_8WAY + if (sha256_use_8way()) + return scanhash_sha256d_8way(thr_id, pdata, ptarget, + max_nonce, hashes_done); +#endif +#ifdef HAVE_SHA256_4WAY + if (sha256_use_4way()) + return scanhash_sha256d_4way(thr_id, pdata, ptarget, + max_nonce, hashes_done); +#endif + + memcpy(data, pdata, 64); // Prepare the initial data block + + do + { + data[19] = ++n; // Update nonce in the data array + sha256_init(hash); // Initialize SHA256 state + sha256_transform(hash, data, 0); // Perform the 
single SHA256 + + // Check if the hash is below the scaled target + if (swab32(hash[7]) <= Htarg) + { + pdata[19] = data[19]; // Store the valid nonce + if (fulltest(hash, ptarget)) + { // Verify full PoW target + *hashes_done = n - first_nonce + 1; + return 1; + } + } + } while (n < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = n - first_nonce + 1; + pdata[19] = n; + return 0; +} \ No newline at end of file
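 
Review note on the new scanhash_sha256()/scanhash_sha256ET10() paths (not part of the patch itself): as written, both functions copy only the first 64 bytes of the 80-byte header (memcpy(data, pdata, 64)) and then call sha256_transform(), which consumes only block words 0-15, so the nonce written at data[19] never enters the hash and words 16-18 are left uninitialized. They also fall back to scanhash_sha256d_8way()/scanhash_sha256d_4way(), which compute double SHA-256, so on any SSE2-capable CPU the single-hash code is never exercised. Below is a hedged, plain-C sketch of what a single-SHA-256 scan over the full 80-byte header could look like, assuming the intended algorithm is one SHA-256 of the header with the same word ordering and quick-check convention the patch already uses for sha256d; the function name is illustrative and it relies only on helpers already present in sha2.c/miner.h (sha256_init, sha256_transform, swab32, fulltest, work_restart).

/* Sketch only, under the assumptions stated above. */
static int scanhash_sha256_sketch(int thr_id, uint32_t *pdata,
	const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done)
{
	uint32_t data[16] __attribute__((aligned(128)));	/* second 64-byte block */
	uint32_t hash[8] __attribute__((aligned(32)));
	uint32_t midstate[8];
	uint32_t n = pdata[19] - 1;
	const uint32_t first_nonce = pdata[19];
	const uint32_t Htarg = ptarget[7];
	int i;

	/* Second block: last 16 header bytes plus SHA-256 padding for an 80-byte message. */
	memcpy(data, pdata + 16, 16);
	data[4] = 0x80000000;
	for (i = 5; i < 15; i++)
		data[i] = 0;
	data[15] = 80 * 8;	/* message length in bits */

	/* Midstate over the first 64 header bytes (nonce-independent, computed once). */
	sha256_init(midstate);
	sha256_transform(midstate, pdata, 0);

	do {
		data[3] = ++n;	/* header word 19 (the nonce) is word 3 of this block */
		memcpy(hash, midstate, 32);
		sha256_transform(hash, data, 0);

		/* Same cheap pre-filter convention as the sha256d path. */
		if (swab32(hash[7]) <= Htarg) {
			pdata[19] = data[3];
			for (i = 0; i < 8; i++)
				hash[i] = swab32(hash[i]);	/* byte order expected by fulltest() */
			if (fulltest(hash, ptarget)) {
				*hashes_done = n - first_nonce + 1;
				return 1;
			}
		}
	} while (n < max_nonce && !work_restart[thr_id].restart);

	*hashes_done = n - first_nonce + 1;
	pdata[19] = n;
	return 0;
}

The same shape would apply to the ET10 variant; the only difference in the patch is Htarg = ptarget[7] * 10, which (being an unsigned multiply) wraps modulo 2^32 for large ptarget[7], and since fulltest() still checks the unscaled ptarget, the scaled value only widens or narrows the pre-filter rather than changing which shares are ultimately accepted.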