|
| 1 | +/* |
| 2 | + * convert_to_labeled_directories.c |
| 3 | + * |
| 4 | + * Usage: convert_to_labeled_directories <labels_file> <images_file> <output_dir> |
| 5 | + * |
| 6 | + * Read the data from the raw MNIST format (either training or test) and write each image as a PGM file to <output_dir>/<label>/image<N>.pgm |
| 7 | + * |
| 8 | + */ |
| 9 | + |
| 10 | +#include <stdio.h> |
| 11 | +#include <stdlib.h> |
| 12 | +#include <strings.h> |
| 13 | +#include <sys/types.h> |
| 14 | +#include <sys/stat.h> |
| 15 | + |
| 16 | +#ifdef __GNUC__ |
| 17 | +#include <string.h> |
| 18 | +#include <arpa/inet.h> |
| 19 | +#endif |
| 20 | + |
// Illusion of type safety
typedef unsigned char byte;
// Only define our own bool when the language/library has not already done so:
// in C23 `bool` is a keyword, and with <stdbool.h> it is a macro, and in both
// cases `typedef int bool;` would fail to compile.
#if !defined(bool) && (!defined(__STDC_VERSION__) || __STDC_VERSION__ <= 201710L)
typedef int bool;
#endif

// Function Declarations
int read_int(FILE *file);
byte *read_labels(char *filename);
byte *read_images(char *filename);
bool isDirectory(char *path);
void write_pgm_image(char *directory, char *filename, byte *image);

// Global State
// Filled in by read_labels() (num_labels) and read_images() (the rest);
// width and height are also read by main() and write_pgm_image().
int num_labels = 0;
int num_images = 0;
int width = 0;
int height = 0;

// Useful Constants
#define LABEL_MAGIC 2049  // expected first integer of a labels file
#define IMAGE_MAGIC 2051  // expected first integer of an images file
#define PGM_MAGIC "P5"    // first token written to every PGM header
#define MAX_GRAY 255      // maximum gray value written to the PGM header
#define MAX_PATH 255      // size in bytes of the path buffers
#define TRUE 1
#define FALSE 0
| 46 | + |
| 47 | +int main(int argc, char **argv) { |
| 48 | + if (argc != 4) { |
| 49 | + printf("Usage: convert_to_labeled_directories <labels_file> <images_file> <output_dir>\n"); |
| 50 | + exit(1); |
| 51 | + } |
| 52 | + char *labels_file = argv[1]; |
| 53 | + char *images_file = argv[2]; |
| 54 | + char *output_dir = argv[3]; |
| 55 | + |
| 56 | + byte *labels = read_labels(labels_file); |
| 57 | + printf("Read %d labels from: %s\n", num_labels, labels_file); |
| 58 | + |
| 59 | + byte *images = read_images(images_file); |
| 60 | + printf("Read %d images from: %s\n", num_images, images_file); |
| 61 | + |
| 62 | + if (num_images != num_labels) { |
| 63 | + fprintf(stderr, "Number of labels and number of images do not match, stopping.\n"); |
| 64 | + exit(1); |
| 65 | + } |
| 66 | + |
| 67 | + if (!isDirectory(output_dir)) { |
| 68 | + fprintf(stderr, "Directory %s does not exist, please create it and try again.\n", output_dir); |
| 69 | + exit(1); |
| 70 | + } |
| 71 | + |
| 72 | + char directory[MAX_PATH]; |
| 73 | + char filename[MAX_PATH]; |
| 74 | + |
| 75 | + for (int i = 0; i < num_images; i++) { |
| 76 | + byte *current_image = &images[i * width * height]; |
| 77 | + byte label = labels[i]; |
| 78 | + |
| 79 | + bzero(directory, MAX_PATH); |
| 80 | + if (snprintf(directory, MAX_PATH, "%s/%d/", output_dir, label) >= MAX_PATH) { |
| 81 | + fprintf(stderr, "Directory path too long: %s, stopping.\n", directory); |
| 82 | + exit(1); |
| 83 | + } |
| 84 | + |
| 85 | + bzero(filename, MAX_PATH); |
| 86 | + if (snprintf(filename, MAX_PATH, "image%d.pgm", i) >= MAX_PATH) { |
| 87 | + fprintf(stderr, "Filename too long: %s, stopping.\n", filename); |
| 88 | + exit(1); |
| 89 | + } |
| 90 | + write_pgm_image(directory, filename, current_image); |
| 91 | + } |
| 92 | + |
| 93 | + free(labels); |
| 94 | + free(images); |
| 95 | +} |
| 96 | + |
| 97 | +/* |
| 98 | + * Open the file, allocate a buffer, and read the images data |
| 99 | + * The caller should free the memory when done. |
| 100 | + */ |
| 101 | +byte *read_images(char *filename) { |
| 102 | + FILE *file = fopen(filename, "r"); |
| 103 | + if (file == NULL) { |
| 104 | + perror("Unable to read images file"); |
| 105 | + exit(1); |
| 106 | + } |
| 107 | + |
| 108 | + int magic = read_int(file); |
| 109 | + if (magic != IMAGE_MAGIC) { |
| 110 | + fprintf(stderr, "%s is not an image file -- magic number does not match\n", filename); |
| 111 | + exit(1); |
| 112 | + } |
| 113 | + |
| 114 | + num_images = read_int(file); |
| 115 | + height = read_int(file); |
| 116 | + width = read_int(file); |
| 117 | + if (num_images <= 0 || height <= 0 || width <= 0) { |
| 118 | + fprintf(stderr, "Error reading image data: num_images=%d, height=%d, width=%d\n", num_images, height, width); |
| 119 | + exit(1); |
| 120 | + } |
| 121 | + |
| 122 | + int num_bytes = num_images * height * width; |
| 123 | + byte *images = calloc(num_bytes, sizeof(byte)); |
| 124 | + |
| 125 | + int bytes_read = fread(images, sizeof(byte), num_bytes, file); |
| 126 | + if (bytes_read != num_bytes) { |
| 127 | + fprintf(stderr, "Error read %d bytes from images file, expected %d\n", bytes_read, num_bytes); |
| 128 | + exit(1); |
| 129 | + } |
| 130 | + fclose(file); |
| 131 | + |
| 132 | + return images; |
| 133 | +} |
| 134 | + |
| 135 | +/* |
| 136 | + * Open the file, allocate a buffer, and read the label data |
| 137 | + * The caller should free the memory when done. |
| 138 | + */ |
| 139 | +byte *read_labels(char *filename) { |
| 140 | + FILE *file = fopen(filename, "r"); |
| 141 | + if (file == NULL) { |
| 142 | + perror("Unable to read labels file"); |
| 143 | + exit(1); |
| 144 | + } |
| 145 | + |
| 146 | + int magic = read_int(file); |
| 147 | + |
| 148 | + if (magic != LABEL_MAGIC) { |
| 149 | + fprintf(stderr, "%s is not a label file -- magic number does not match\n", filename); |
| 150 | + exit(1); |
| 151 | + } |
| 152 | + |
| 153 | + num_labels = read_int(file); |
| 154 | + |
| 155 | + byte *labels = calloc(num_labels, sizeof(byte)); |
| 156 | + int items_read = fread(labels, sizeof(byte), num_labels, file); |
| 157 | + if (items_read != num_labels) { |
| 158 | + fprintf(stderr, "Error reading from label file\n"); |
| 159 | + exit(1); |
| 160 | + } |
| 161 | + fclose(file); |
| 162 | + |
| 163 | + return labels; |
| 164 | +} |
| 165 | + |
/*
 * Read an unsigned 32-bit integer value in big-endian format.
 *
 * Exactly four bytes are consumed from the stream and combined most
 * significant byte first, so the result does not depend on the host byte
 * order or on sizeof(int).
 *
 * Returns the value converted to int. If four bytes cannot be read, a
 * diagnostic is printed and the process exits with status 1.
 */
int read_int(FILE *file) {
    unsigned char raw[4];

    if (fread(raw, sizeof raw, 1, file) != 1) {
        fprintf(stderr, "Error reading int from file\n");
        exit(1);
    }

    unsigned long value = ((unsigned long)raw[0] << 24)
                        | ((unsigned long)raw[1] << 16)
                        | ((unsigned long)raw[2] << 8)
                        |  (unsigned long)raw[3];

    // Values above INT_MAX do not fit the int return type; the callers in
    // this file reject non-positive results.
    return (int)value;
}
| 178 | + |
/*
 * Return a non-zero value if and only if the path is an accessible directory.
 *
 * Returns 0 when stat() fails on the path or when the path names anything
 * other than a directory.
 */
bool isDirectory(char *path) {
    struct stat info;

    // S_ISDIR compares the file-type field as a whole. Testing
    // (st_mode & S_IFDIR) is not equivalent: other file types, such as block
    // devices, have that bit set as well.
    return stat(path, &info) == 0 && S_ISDIR(info.st_mode);
}
| 190 | + |
| 191 | +/* |
| 192 | + * Save the image in PGM format with the path specified by the directory and filename provided |
| 193 | + */ |
| 194 | +void write_pgm_image(char *directory, char *filename, byte *image) { |
| 195 | + if (!isDirectory(directory)) { |
| 196 | + if (mkdir(directory, 0755) < 0) { |
| 197 | + perror("Unable to create directory"); |
| 198 | + exit(1); |
| 199 | + } |
| 200 | + } |
| 201 | + |
| 202 | + char path[MAX_PATH]; |
| 203 | + bzero(&path, MAX_PATH); |
| 204 | + if (strlcpy(path, directory, MAX_PATH) >= MAX_PATH) { |
| 205 | + fprintf(stderr, "Directory path too long: %s, stopping.\n", path); |
| 206 | + exit(1); |
| 207 | + } |
| 208 | + if (strlcat(path, filename, MAX_PATH) >= MAX_PATH) { |
| 209 | + fprintf(stderr, "File path too long: %s, stopping.\n", path); |
| 210 | + exit(1); |
| 211 | + } |
| 212 | + |
| 213 | + FILE *outfile = fopen(path, "w"); |
| 214 | + if (outfile == NULL) { |
| 215 | + perror("Unable to open image file for writing"); |
| 216 | + exit(1); |
| 217 | + } |
| 218 | + |
| 219 | + fprintf(outfile, "%s %d %d %d\n", PGM_MAGIC, width, height, MAX_GRAY); |
| 220 | + int num_pixels = width * height; |
| 221 | + if (fwrite(image, sizeof(byte), num_pixels, outfile) != num_pixels) { |
| 222 | + perror("Unable to write to image file"); |
| 223 | + exit(1); |
| 224 | + } |
| 225 | + |
| 226 | + fclose(outfile); |
| 227 | +} |
0 commit comments