Skip to content

Commit

Permalink
replace the move-to-front transform
Browse files Browse the repository at this point in the history
  • Loading branch information
kspalaiologos committed May 2, 2022
1 parent eebd6cd commit 2a30fc3
Show file tree
Hide file tree
Showing 5 changed files with 170 additions and 60 deletions.
5 changes: 5 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"files.associations": {
"srt.h": "c"
}
}
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ CFLAGS=-O3 -march=native -mtune=native -flto

all: bzip3

%.o: %.c mtf.h rle.h crc32.h cm.h
%.o: %.c srt.h rle.h crc32.h cm.h
$(CC) $(CFLAGS) -c $< -o $@

bzip3: main.o libsais.o
Expand Down
16 changes: 9 additions & 7 deletions main.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

#include "libsais.h"
#include "rle.h"
#include "mtf.h"
#include "srt.h"
#include "crc32.h"
#include "cm.h"

Expand Down Expand Up @@ -67,7 +67,7 @@ int main(int argc, char *argv[]) {
output_des = STDOUT_FILENO;
}

struct mtf_state mtf_state;
struct srt_state mtf_state;

if (mode == 1) {
// Encode
Expand All @@ -87,19 +87,20 @@ int main(int argc, char *argv[]) {
int32_t new_size = mrlec(buffer, bytes_read, output);
int32_t bwt_index =
libsais_bwt(output, output, sais_array, new_size, 16, NULL);
mtf_encode(&mtf_state, output, buffer, new_size);
int32_t new_size2 = srt_encode(&mtf_state, output, buffer, new_size);

begin(&s);
s.out_queue = output;
s.output_ptr = 0;
for (int32_t i = 0; i < new_size; i++) encode_bit(&s, buffer[i]);
for (int32_t i = 0; i < new_size2; i++) encode_bit(&s, buffer[i]);
flush(&s);
int32_t new_size3 = s.output_ptr;

write(output_des, &crc32, sizeof(uint32_t));
write(output_des, &bytes_read, sizeof(int32_t));
write(output_des, &bwt_index, sizeof(int32_t));
write(output_des, &new_size, sizeof(int32_t));
write(output_des, &new_size2, sizeof(int32_t));
write(output_des, &new_size3, sizeof(int32_t));
write(output_des, output, new_size3);
}
Expand Down Expand Up @@ -127,12 +128,13 @@ int main(int argc, char *argv[]) {
if (read(fd, buf, size) != size) break;

uint32_t crc32;
int32_t bytes_read, bwt_index, new_size, new_size3;
int32_t bytes_read, bwt_index, new_size, new_size2, new_size3;

safe_read(input_des, &crc32, sizeof(uint32_t));
safe_read(input_des, &bytes_read, sizeof(int32_t));
safe_read(input_des, &bwt_index, sizeof(int32_t));
safe_read(input_des, &new_size, sizeof(int32_t));
safe_read(input_des, &new_size2, sizeof(int32_t));
safe_read(input_des, &new_size3, sizeof(int32_t));
safe_read(input_des, buffer, new_size3);

Expand All @@ -141,8 +143,8 @@ int main(int argc, char *argv[]) {
s.input_ptr = 0;
s.input_max = new_size3;
init(&s);
for (int32_t i = 0; i < new_size; i++) output[i] = decode_bit(&s);
mtf_decode(&mtf_state, output, buffer, new_size);
for (int32_t i = 0; i < new_size2; i++) output[i] = decode_bit(&s);
srt_decode(&mtf_state, output, buffer, new_size2);
libsais_unbwt(buffer, output, sais_array, new_size, NULL,
bwt_index);
mrled(output, buffer, bytes_read);
Expand Down
52 changes: 0 additions & 52 deletions mtf.h

This file was deleted.

155 changes: 155 additions & 0 deletions srt.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@

#ifndef _MTF_H
#define _MTF_H

/* Header size constant: 4 * 256 = 1024.
 * NOTE(review): presumably meant as a bound on the encoded frequency header
 * (256 entries); it is not referenced anywhere else in this header, and
 * encode_header can emit up to 5 bytes for a single large count - confirm
 * the intended meaning against callers. */
static const int MAX_HDR_SIZE = 4 * 256;

/* Scratch state shared by srt_encode and srt_decode. Every table is indexed
 * by a byte value (0..255) or by a rank (0..255). */
struct srt_state {
/* Number of occurrences of each byte value in the block. */
uint32_t freqs[256];
/* Byte values with a non-zero count, in the order produced by preprocess. */
uint8_t symbols[256];
/* Rank -> symbol table of the move-to-front list. */
uint32_t r2s[256];
/* Symbol -> rank table (inverse of r2s); only srt_encode uses it. */
uint32_t s2r[256];
/* Per-symbol cursor into that symbol's bucket of the rank stream. */
uint32_t buckets[256];
/* Per-symbol end offset of its bucket; only srt_decode writes and reads it. */
uint32_t bucket_ends[256];
};

/*
 * Collect every byte value with a non-zero frequency into `symbols` and sort
 * them by decreasing frequency, breaking ties by increasing byte value.
 *
 * freqs   - 256 occurrence counts, indexed by byte value (read only).
 * symbols - output array with room for 256 entries.
 *
 * Returns the number of symbols written to `symbols`.
 */
static int preprocess(const uint32_t * freqs, uint8_t * symbols) {
    int nb_symbols = 0;
    for(int i = 0; i < 256; i++)
        if(freqs[i] > 0)
            symbols[nb_symbols++] = (uint8_t) i;

    /* Shell sort: grow the gap past the element count, then shrink it by
     * thirds until the final pass with gap 1. */
    int h = 4;
    while(h < nb_symbols)
        h = h * 3 + 1;

    do {
        h /= 3;
        for(int i = h; i < nb_symbols; i++) {
            const int t = symbols[i];
            int b = i - h;
            /* The bounds check must guard BOTH ordering clauses; without the
             * outer parentheses the tie-break clause reads symbols[b] with a
             * negative index. */
            while(b >= 0
                  && (freqs[symbols[b]] < freqs[t]
                      || (freqs[symbols[b]] == freqs[t] && t < symbols[b]))) {
                symbols[b + h] = symbols[b];
                b -= h;
            }
            symbols[b + h] = (uint8_t) t;
        }
    } while(h != 1);

    return nb_symbols;
}

/*
 * Serialise the 256 frequency counts as variable-length integers: seven
 * payload bits per byte, least-significant group first, with the top bit set
 * on every byte except the last one of a value.
 *
 * freqs - 256 counts to write.
 * dst   - destination buffer.
 *
 * Returns the number of bytes written to dst.
 */
static int encode_header(uint32_t * freqs, uint8_t * dst) {
    uint8_t *out = dst;
    for(const uint32_t *p = freqs; p != freqs + 256; p++) {
        uint32_t rest = *p;
        /* Emit continuation bytes while more than seven bits remain. */
        for(; rest >= 128; rest >>= 7)
            *out++ = (uint8_t) (rest | 0x80);
        *out++ = (uint8_t) rest;
    }
    return (int) (out - dst);
}

/*
 * Read 256 variable-length frequency counts from src: seven payload bits per
 * byte, least-significant group first, top bit set on every byte except the
 * last one of a value. At most five bytes are consumed per value.
 *
 * src   - encoded header bytes.
 * freqs - receives the 256 decoded counts.
 *
 * Returns the number of bytes consumed from src.
 */
static int decode_header(const uint8_t * src, uint32_t * freqs) {
    uint32_t idx = 0;
    for(int i = 0; i < 256; i++) {
        uint32_t val = src[idx++];
        uint32_t res = val & 0x7F;
        int shift = 7;
        while(val >= 128) {
            val = src[idx++];
            /* Unsigned arithmetic: the fifth group is shifted by 28 bits,
             * which would overflow a signed int for group values >= 8. */
            res |= (val & 0x7F) << shift;
            if(shift > 21)
                break;
            shift += 7;
        }
        freqs[i] = res;
    }
    return (int) idx;
}

/*
 * Forward transform. Writes a frequency header followed by one rank byte per
 * input byte, with the ranks grouped into per-symbol buckets.
 *
 * mtf   - scratch state; its tables are overwritten.
 * src   - input bytes.
 * dst   - output buffer; receives the header and then `count` rank bytes.
 * count - number of input bytes.
 *
 * Returns count plus the size of the header written to dst.
 */
uint32_t srt_encode(struct srt_state * mtf, uint8_t *src, uint8_t *dst, uint32_t count) {
    uint32_t *const freqs = mtf->freqs;
    uint32_t *const rank_to_sym = mtf->r2s;
    uint32_t *const sym_to_rank = mtf->s2r;
    uint32_t *const cursor = mtf->buckets;

    /* Pass 1: count occurrences run by run; the order in which symbols first
     * appear defines their initial ranks. */
    for(int sym = 0; sym < 256; sym++)
        freqs[sym] = 0;

    uint32_t next_rank = 0;
    uint32_t pos = 0;
    while(pos < count) {
        const uint8_t sym = src[pos];
        if(freqs[sym] == 0) {
            rank_to_sym[next_rank] = sym;
            sym_to_rank[sym] = next_rank;
            next_rank++;
        }
        uint32_t run_end = pos + 1;
        while(run_end < count && src[run_end] == sym)
            run_end++;
        freqs[sym] += run_end - pos;
        pos = run_end;
    }

    /* Lay the buckets out back to back in the order chosen by preprocess;
     * each bucket is as long as its symbol's count. */
    const int n_symbols = preprocess(freqs, mtf->symbols);
    uint32_t bucket_start = 0;
    for(int k = 0; k < n_symbols; k++) {
        const uint8_t sym = mtf->symbols[k];
        cursor[sym] = bucket_start;
        bucket_start += freqs[sym];
    }

    const uint32_t header_size = encode_header(freqs, dst);
    uint8_t *const payload = dst + header_size;

    /* Pass 2: emit the current rank of each run's symbol into that symbol's
     * bucket, then move the symbol to the front of the rank list. */
    pos = 0;
    while(pos < count) {
        const uint8_t sym = src[pos++];
        uint32_t rank = sym_to_rank[sym] & 0xFF;
        uint32_t out = cursor[sym];

        payload[out++] = (uint8_t) rank;

        if(rank != 0) {
            /* Slide every better-ranked symbol down by one place. */
            for(; rank != 0; rank--) {
                rank_to_sym[rank] = rank_to_sym[rank - 1];
                sym_to_rank[rank_to_sym[rank]] = rank;
            }
            rank_to_sym[0] = sym;
            sym_to_rank[sym] = 0;
        }

        /* The remainder of the run is already at the front: rank 0. */
        for(; pos < count && src[pos] == sym; pos++)
            payload[out++] = 0;

        cursor[sym] = out;
    }

    return count + header_size;
}

/*
 * Inverse transform. Reads the frequency header from src, then rebuilds the
 * original bytes from the bucketed rank stream that follows it.
 *
 * mtf   - scratch state; its tables are overwritten.
 * src   - encoded input: header followed by rank bytes.
 * dst   - output buffer; receives (count - header size) bytes.
 * count - total encoded size INCLUDING the header, i.e. the value returned
 *         by srt_encode.
 *
 * Returns the number of bytes written to dst, or 0 when the header alone is
 * at least `count` bytes long.
 *
 * NOTE(review): beyond that check the rank stream is not validated; counts or
 * ranks inconsistent with `count` are not detected here.
 */
uint32_t srt_decode(struct srt_state * mtf, uint8_t *src, uint8_t *dst, uint32_t count) {
    const uint32_t header_size = decode_header(src, mtf->freqs);
    /* Nothing to decode, and guards the subtraction below against wrapping. */
    if(header_size >= count)
        return 0;
    /* Only the bytes after the header are symbols; iterating `count` times
     * would write header_size extra bytes past the real output. */
    const uint32_t payload_size = count - header_size;
    const uint32_t src_idx = header_size;

    int nb_symbols = preprocess(mtf->freqs, mtf->symbols);
    uint32_t bucket_pos = 0;
    for(int i = 0; i < nb_symbols; i++) {
        const int c = mtf->symbols[i];
        /* The first entry of each bucket is the symbol's initial rank. */
        mtf->r2s[src[src_idx + bucket_pos]] = (uint32_t) c;
        mtf->buckets[c] = bucket_pos + 1;
        bucket_pos += mtf->freqs[c];
        mtf->bucket_ends[c] = bucket_pos;
    }

    uint32_t c = mtf->r2s[0];
    for(uint32_t i = 0; i < payload_size; i++) {
        dst[i] = (uint8_t) c;
        if(mtf->buckets[c] < mtf->bucket_ends[c]) {
            /* Next rank recorded for c: where c sits when it is seen again. */
            const int r = src[src_idx + mtf->buckets[c]];
            mtf->buckets[c]++;
            if(r == 0)
                continue;
            for(int s = 0; s < r; s++)
                mtf->r2s[s] = mtf->r2s[s + 1];
            mtf->r2s[r] = c;
            c = mtf->r2s[0];
        } else {
            /* c's bucket is exhausted: drop it from the rank list. */
            if(nb_symbols == 1)
                continue;
            nb_symbols--;
            for(int s = 0; s < nb_symbols; s++)
                mtf->r2s[s] = mtf->r2s[s + 1];
            c = mtf->r2s[0];
        }
    }
    return payload_size;
}

#endif

0 comments on commit 2a30fc3

Please sign in to comment.