From 2a30fc3932dbb19a69f18dc4516caa2f36440578 Mon Sep 17 00:00:00 2001
From: Kamila Szewczyk
Date: Mon, 2 May 2022 07:56:03 +0200
Subject: [PATCH] replace the move-to-front transform

---
 .vscode/settings.json |   5 ++
 Makefile              |   2 +-
 main.c                |  16 +++--
 mtf.h                 |  52 --------------
 srt.h                 | 224 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 239 insertions(+), 60 deletions(-)
 create mode 100644 .vscode/settings.json
 delete mode 100644 mtf.h
 create mode 100644 srt.h

diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..926b26e
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,5 @@
+{
+    "files.associations": {
+        "srt.h": "c"
+    }
+}
\ No newline at end of file
diff --git a/Makefile b/Makefile
index a0d20f3..9d36a25 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@ CFLAGS=-O3 -march=native -mtune=native -flto
 
 all: bzip3
 
-%.o: %.c mtf.h rle.h crc32.h cm.h
+%.o: %.c srt.h rle.h crc32.h cm.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
 bzip3: main.o libsais.o
diff --git a/main.c b/main.c
index c34b19e..e99fbd1 100644
--- a/main.c
+++ b/main.c
@@ -8,7 +8,7 @@
 
 #include "libsais.h"
 #include "rle.h"
-#include "mtf.h"
+#include "srt.h"
 #include "crc32.h"
 #include "cm.h"
 
@@ -67,7 +67,7 @@ int main(int argc, char *argv[]) {
         output_des = STDOUT_FILENO;
     }
 
-    struct mtf_state mtf_state;
+    struct srt_state mtf_state;
 
     if (mode == 1) {
         // Encode
@@ -87,12 +87,12 @@ int main(int argc, char *argv[]) {
 
             int32_t new_size = mrlec(buffer, bytes_read, output);
             int32_t bwt_index = libsais_bwt(output, output, sais_array, new_size, 16, NULL);
-            mtf_encode(&mtf_state, output, buffer, new_size);
+            int32_t new_size2 = srt_encode(&mtf_state, output, buffer, new_size);
 
             begin(&s);
             s.out_queue = output;
             s.output_ptr = 0;
-            for (int32_t i = 0; i < new_size; i++) encode_bit(&s, buffer[i]);
+            for (int32_t i = 0; i < new_size2; i++) encode_bit(&s, buffer[i]);
             flush(&s);
             int32_t new_size3 = s.output_ptr;
 
@@ -100,6 +100,7 @@ int main(int argc, char *argv[]) {
             write(output_des, &bytes_read, sizeof(int32_t));
             write(output_des, &bwt_index, sizeof(int32_t));
             write(output_des, &new_size, sizeof(int32_t));
+            write(output_des, &new_size2, sizeof(int32_t));
             write(output_des, &new_size3, sizeof(int32_t));
             write(output_des, output, new_size3);
         }
@@ -127,12 +128,13 @@ int main(int argc, char *argv[]) {
     if (read(fd, buf, size) != size) break;
 
             uint32_t crc32;
-            int32_t bytes_read, bwt_index, new_size, new_size3;
+            int32_t bytes_read, bwt_index, new_size, new_size2, new_size3;
 
             safe_read(input_des, &crc32, sizeof(uint32_t));
             safe_read(input_des, &bytes_read, sizeof(int32_t));
             safe_read(input_des, &bwt_index, sizeof(int32_t));
             safe_read(input_des, &new_size, sizeof(int32_t));
+            safe_read(input_des, &new_size2, sizeof(int32_t));
             safe_read(input_des, &new_size3, sizeof(int32_t));
             safe_read(input_des, buffer, new_size3);
 
@@ -141,8 +143,8 @@ int main(int argc, char *argv[]) {
             s.input_ptr = 0;
             s.input_max = new_size3;
             init(&s);
-            for (int32_t i = 0; i < new_size; i++) output[i] = decode_bit(&s);
-            mtf_decode(&mtf_state, output, buffer, new_size);
+            for (int32_t i = 0; i < new_size2; i++) output[i] = decode_bit(&s);
+            srt_decode(&mtf_state, output, buffer, new_size2);
             libsais_unbwt(buffer, output, sais_array, new_size, NULL, bwt_index);
             mrled(output, buffer, bytes_read);
 
diff --git a/mtf.h b/mtf.h
deleted file mode 100644
index eddcf6a..0000000
--- a/mtf.h
+++ /dev/null
@@ -1,52 +0,0 @@
-
-#ifndef _MTF_H
-#define _MTF_H
-
-struct mtf_state {
-    uint32_t prev[256], curr[256], symbols[256], ranks[256];
-};
-
-void mtf_encode(struct mtf_state * mtf, uint8_t *src, uint8_t *dst, uint32_t count) {
-    for (uint32_t i = 0; i < 256; i++) {
-        mtf->prev[i] = mtf->curr[i] = 0;
-        mtf->symbols[i] = mtf->ranks[i] = i;
-    }
-
-    for (uint32_t i = 0; i < count; i++) {
-        uint32_t r = mtf->symbols[src[i]];
-        dst[i] = r;
-
-        mtf->prev[src[i]] = mtf->curr[src[i]] = i;
-
-        for (; r > 0 && mtf->curr[mtf->ranks[r - 1]] <= i; r--) {
-            mtf->ranks[r] = mtf->ranks[r - 1];
-            mtf->symbols[mtf->ranks[r]] = r;
-        }
-
-        mtf->ranks[r] = src[i];
-        mtf->symbols[src[i]] = r;
-    }
-}
-
-void mtf_decode(struct mtf_state * mtf, uint8_t *src, uint8_t *dst, uint32_t count) {
-    for (uint32_t i = 0; i < 256; i++) {
-        mtf->prev[i] = mtf->curr[i] = 0;
-        mtf->ranks[i] = i;
-    }
-
-    for (uint32_t i = 0; i < count; i++) {
-        uint32_t r = src[i] & 0xFF;
-
-        const uint32_t c = mtf->ranks[r];
-        dst[i] = (int8_t)c;
-
-        mtf->prev[c] = mtf->curr[c] = i;
-
-        for (; r > 0 && mtf->curr[mtf->ranks[r - 1]] <= i; r--)
-            mtf->ranks[r] = mtf->ranks[r - 1];
-
-        mtf->ranks[r] = c;
-    }
-}
-
-#endif
diff --git a/srt.h b/srt.h
new file mode 100644
index 0000000..bcf8bef
--- /dev/null
+++ b/srt.h
@@ -0,0 +1,224 @@
+
+#ifndef SRT_H
+#define SRT_H
+
+#include <stdint.h>
+
+/*
+ * Nominal size of the frequency header: 256 symbols, 4 bytes each.
+ * NOTE(review): encode_header() emits 5 bytes for a frequency >= 2^28;
+ * confirm that block sizes stay below that before relying on this bound.
+ */
+static const int MAX_HDR_SIZE = 4 * 256;
+
+/* Tables shared by srt_encode() and srt_decode(). */
+struct srt_state {
+    uint32_t freqs[256];       /* occurrences of every byte value */
+    uint8_t symbols[256];      /* byte values in use, most frequent first */
+    uint32_t r2s[256];         /* rank -> symbol */
+    uint32_t s2r[256];         /* symbol -> rank */
+    uint32_t buckets[256];     /* next position inside each symbol's bucket */
+    uint32_t bucket_ends[256]; /* one past the end of each bucket (decoder) */
+};
+
+/*
+ * Collects every symbol with a non-zero frequency into `symbols` and
+ * shell-sorts them by decreasing frequency (ties: increasing symbol value).
+ * Returns the number of symbols in use.
+ */
+static int preprocess(const uint32_t * freqs, uint8_t * symbols) {
+    int nb_symbols = 0;
+    for(int i = 0; i < 256; i++)
+        if(freqs[i] > 0)
+            symbols[nb_symbols++] = (uint8_t) i;
+
+    int h = 4;
+    while(h < nb_symbols)
+        h = h * 3 + 1;
+
+    do {
+        h /= 3;
+        for(int i = h; i < nb_symbols; i++) {
+            const int t = symbols[i];
+            int b = i - h;
+            // Check the bound first: symbols[b] must not be read for b < 0.
+            while(b >= 0 && (freqs[symbols[b]] < freqs[t] ||
+                             (freqs[symbols[b]] == freqs[t] && t < symbols[b]))) {
+                symbols[b + h] = symbols[b];
+                b -= h;
+            }
+            symbols[b + h] = (uint8_t) t;
+        }
+    } while(h != 1);
+
+    return nb_symbols;
+}
+
+/*
+ * Writes the 256 frequencies to `dst` as base-128 varints: seven payload bits
+ * per byte, least significant group first, high bit set on all but the last
+ * byte. Returns the number of bytes written.
+ */
+static int encode_header(uint32_t * freqs, uint8_t * dst) {
+    uint32_t idx = 0;
+    for(int i = 0; i < 256; i++) {
+        uint32_t f = freqs[i];
+        while(f >= 128) {
+            dst[idx++] = (uint8_t) (f | 0x80);
+            f >>= 7;
+        }
+        dst[idx++] = (uint8_t) f;
+    }
+    return idx;
+}
+
+/*
+ * Reads the 256 varint frequencies written by encode_header() from `src`.
+ * Returns the number of bytes consumed.
+ */
+static int decode_header(uint8_t * src, uint32_t * freqs) {
+    uint32_t idx = 0;
+    for(int i = 0; i < 256; i++) {
+        uint32_t val = src[idx++];
+        // Unsigned accumulator: the last group is shifted by 28 bits.
+        uint32_t res = val & 0x7F;
+        for(int shift = 7; val >= 128 && shift <= 28; shift += 7) {
+            val = src[idx++];
+            res |= (val & 0x7F) << shift;
+        }
+        freqs[i] = res;
+    }
+    return idx;
+}
+
+/*
+ * Applies the sorted rank transform to `count` bytes of `src`.
+ *
+ * `dst` receives the frequency header followed by `count` rank bytes. Every
+ * symbol owns one contiguous bucket in that payload, laid out in the order
+ * computed by preprocess(). For each run of equal bytes the symbol's current
+ * move-to-front rank is appended to its bucket, followed by a zero for every
+ * further byte of the run.
+ *
+ * Returns the number of bytes written to `dst` (header size + `count`).
+ */
+uint32_t srt_encode(struct srt_state * mtf, uint8_t *src, uint8_t *dst, uint32_t count) {
+    // Find first symbols and build a histogram.
+    for(int i = 0; i < 256; i++)
+        mtf->freqs[i] = 0;
+    for(uint32_t i = 0, b = 0; i < count;) {
+        const uint8_t c = src[i];
+        if(mtf->freqs[c] == 0) {
+            // Initial ranks follow the order of first appearance.
+            mtf->r2s[b] = c;
+            mtf->s2r[c] = b;
+            b++;
+        }
+        uint32_t j = i + 1;
+        while(j < count && src[j] == c)
+            j++;
+        mtf->freqs[c] += j - i;
+        i = j;
+    }
+
+    // Give every symbol the start offset of its bucket.
+    const int n_symbols = preprocess(mtf->freqs, mtf->symbols);
+    uint32_t bucket_pos = 0;
+    for(int i = 0; i < n_symbols; i++) {
+        const uint8_t s = mtf->symbols[i];
+        mtf->buckets[s] = bucket_pos;
+        bucket_pos += mtf->freqs[s];
+    }
+
+    const uint32_t header_size = encode_header(mtf->freqs, dst);
+    uint8_t * payload = dst + header_size;
+    for(uint32_t i = 0; i < count;) {
+        const uint8_t c = src[i];
+        uint32_t r = mtf->s2r[c];
+        uint32_t p = mtf->buckets[c];
+        payload[p++] = (uint8_t) r;
+        if(r != 0) {
+            // Move c to the front: ranks 0..r-1 each slide back by one.
+            for(; r != 0; r--) {
+                mtf->r2s[r] = mtf->r2s[r - 1];
+                mtf->s2r[mtf->r2s[r]] = r;
+            }
+            mtf->r2s[0] = c;
+            mtf->s2r[c] = 0;
+        }
+        i++;
+        while(i < count && src[i] == c) {
+            payload[p++] = 0;
+            i++;
+        }
+        mtf->buckets[c] = p;
+    }
+    return count + header_size;
+}
+
+/*
+ * Inverts srt_encode(). `count` is the size of `src` including the header.
+ *
+ * Writes `count - header size` bytes to `dst` and returns that number, or
+ * returns 0 without writing anything when the header is inconsistent with
+ * `count`.
+ *
+ * NOTE(review): the header is parsed before `count` is checked, so `src` must
+ * hold at least a complete header.
+ */
+uint32_t srt_decode(struct srt_state * mtf, uint8_t *src, uint8_t *dst, uint32_t count) {
+    const uint32_t header_size = decode_header(src, mtf->freqs);
+    if(count <= header_size)
+        return 0;
+    const uint32_t n = count - header_size;
+    const uint8_t * payload = src + header_size;
+
+    // Reset the tables and check that the frequencies add up to the payload.
+    uint64_t total = 0;
+    for(int i = 0; i < 256; i++) {
+        total += mtf->freqs[i];
+        mtf->r2s[i] = 0;
+        mtf->buckets[i] = mtf->bucket_ends[i] = 0;
+    }
+    if(total != n)
+        return 0;
+
+    int nb_symbols = preprocess(mtf->freqs, mtf->symbols);
+    uint32_t bucket_pos = 0;
+    for(int i = 0; i < nb_symbols; i++) {
+        const uint8_t s = mtf->symbols[i];
+        // The first rank stored in a bucket is the symbol's initial rank.
+        mtf->r2s[payload[bucket_pos]] = s;
+        mtf->buckets[s] = bucket_pos + 1;
+        bucket_pos += mtf->freqs[s];
+        mtf->bucket_ends[s] = bucket_pos;
+    }
+
+    uint32_t c = mtf->r2s[0];
+    // Produce only the payload length; the header is not part of the data.
+    for(uint32_t i = 0; i < n; i++) {
+        dst[i] = (uint8_t) c;
+        if(mtf->buckets[c] < mtf->bucket_ends[c]) {
+            const int r = payload[mtf->buckets[c]];
+            mtf->buckets[c]++;
+            if(r == 0)
+                continue; // the run of c continues
+            // c resumes at rank r; everything ahead of that moves up.
+            for(int s = 0; s < r; s++)
+                mtf->r2s[s] = mtf->r2s[s + 1];
+            mtf->r2s[r] = c;
+            c = mtf->r2s[0];
+        } else {
+            // Bucket exhausted: c does not occur again, drop it.
+            if(nb_symbols == 1)
+                continue;
+            nb_symbols--;
+            for(int s = 0; s < nb_symbols; s++)
+                mtf->r2s[s] = mtf->r2s[s + 1];
+            c = mtf->r2s[0];
+        }
+    }
+    return n;
+}
+
+#endif