Skip to content

Commit 6eb06ab

Browse files
author
Steven Wart
committed
First commit
0 parents  commit 6eb06ab

File tree

3 files changed

+278
-0
lines changed

3 files changed

+278
-0
lines changed

Makefile

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
CC=gcc

# strlcpy/strlcat come from libbsd when building with gcc; the overlay
# headers expose them through the usual <string.h>.
# The library is kept in LDLIBS (not CFLAGS) so it can be placed AFTER the
# source file on the command line: linkers resolve libraries in order, and a
# -lbsd that precedes the object needing it may be dropped.
ifeq ($(CC),gcc)
CFLAGS=-I/usr/include/bsd -DLIBBSD_OVERLAY
LDLIBS=-lbsd
endif

convert_to_labeled_directories: convert_to_labeled_directories.c
	$(CC) $(CFLAGS) convert_to_labeled_directories.c -o convert_to_labeled_directories $(LDLIBS)

README.md

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# MNIST to PGM Converter
2+
3+
This utility will convert raw MNIST data files to PGM image files in labeled directories.
4+
5+
## Motivation
6+
7+
MNIST is a database of handwritten digits that is commonly used for machine learning tutorials.
8+
9+
PGM is a [very simple image format](http://netpbm.sourceforge.net/doc/pgm.html) for grayscale images.
10+
11+
Apple's Create ML framework will import image data from these directories, and it may be useful
12+
to view the images directly from the file system.
13+
14+
If you are using Python you are unlikely to need this.
15+
16+
## Alternatives
17+
18+
Most people following an MNIST tutorial use the Keras
19+
[mnist.load_data()](https://github.com/keras-team/keras/blob/v2.8.0/keras/datasets/mnist.py) function
20+
from a Python workspace to obtain their data.
21+
22+
The Keras function does not return the data directly from the [MNIST Homepage](http://yann.lecun.com/exdb/mnist/)
23+
(fortunately, because automated scripts should not do that), but it takes a "pickled" or serialized
24+
Python binary object file from the Google servers.
25+
26+
## Usage
27+
28+
The Keras Python binary is not suitable for this, and neither is the raw data from the MNIST page.
29+
30+
To use this utility, download labels and images from the MNIST data, uncompress them, and run the following commands
31+
to put them into labeled directories:
32+
33+
`mkdir train`
34+
`mkdir test`
35+
`convert_to_labeled_directories train-labels-idx1-ubyte train-images-idx3-ubyte train`
36+
`convert_to_labeled_directories t10k-labels-idx1-ubyte t10k-images-idx3-ubyte test`
37+
38+
## Compiling
39+
40+
To compile on Mac OS X: `clang convert_to_labeled_directories.c -o convert_to_labeled_directories`
41+
42+
To compile on Linux: `gcc -I/usr/include/bsd -DLIBBSD_OVERLAY convert_to_labeled_directories.c -o convert_to_labeled_directories -lbsd`
43+

convert_to_labeled_directories.c

Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
/*
2+
* convert_to_labeled_directories.c
3+
*
4+
* Usage: convert_to_labeled_directories <labels_file> <images_file> <output_dir>
5+
*
6+
* Read the data from the raw MNIST format (either training or test) and
* write each image as a PGM file under <output_dir>/<label>/
7+
*
8+
*/
9+
10+
#include <stdio.h>
11+
#include <stdlib.h>
12+
#include <strings.h>
13+
#include <sys/types.h>
14+
#include <sys/stat.h>
15+
16+
#ifdef __GNUC__
17+
#include <string.h>
18+
#include <arpa/inet.h>
19+
#endif
20+
21+
// Illusion of type safety
typedef unsigned char byte;
// `bool` is a keyword from C23 onwards, where redefining it is an error.
// Only provide the int-based fallback for older language versions.
#if !defined(__STDC_VERSION__) || __STDC_VERSION__ <= 201710L
typedef int bool;
#endif

// Function Declarations
int read_int(FILE *file);
byte *read_labels(char *filename);
byte *read_images(char *filename);
bool isDirectory(char *path);
void write_pgm_image(char *directory, char *filename, byte *image);

// Global State
// num_labels is set by read_labels(); num_images, width and height are set
// by read_images(). width and height are also read by main() and
// write_pgm_image() to size each image.
int num_labels = 0;
int num_images = 0;
int width = 0;
int height = 0;

// Useful Constants
#define LABEL_MAGIC 2049    // expected first integer of a labels file
#define IMAGE_MAGIC 2051    // expected first integer of an images file
#define PGM_MAGIC "P5"      // magic token written at the start of each PGM
#define MAX_GRAY 255        // maximum gray value written in the PGM header
#define MAX_PATH 255        // size in bytes of the path buffers
#define TRUE 1
#define FALSE 0
46+
47+
int main(int argc, char **argv) {
48+
if (argc != 4) {
49+
printf("Usage: convert_to_labeled_directories <labels_file> <images_file> <output_dir>\n");
50+
exit(1);
51+
}
52+
char *labels_file = argv[1];
53+
char *images_file = argv[2];
54+
char *output_dir = argv[3];
55+
56+
byte *labels = read_labels(labels_file);
57+
printf("Read %d labels from: %s\n", num_labels, labels_file);
58+
59+
byte *images = read_images(images_file);
60+
printf("Read %d images from: %s\n", num_images, images_file);
61+
62+
if (num_images != num_labels) {
63+
fprintf(stderr, "Number of labels and number of images do not match, stopping.\n");
64+
exit(1);
65+
}
66+
67+
if (!isDirectory(output_dir)) {
68+
fprintf(stderr, "Directory %s does not exist, please create it and try again.\n", output_dir);
69+
exit(1);
70+
}
71+
72+
char directory[MAX_PATH];
73+
char filename[MAX_PATH];
74+
75+
for (int i = 0; i < num_images; i++) {
76+
byte *current_image = &images[i * width * height];
77+
byte label = labels[i];
78+
79+
bzero(directory, MAX_PATH);
80+
if (snprintf(directory, MAX_PATH, "%s/%d/", output_dir, label) >= MAX_PATH) {
81+
fprintf(stderr, "Directory path too long: %s, stopping.\n", directory);
82+
exit(1);
83+
}
84+
85+
bzero(filename, MAX_PATH);
86+
if (snprintf(filename, MAX_PATH, "image%d.pgm", i) >= MAX_PATH) {
87+
fprintf(stderr, "Filename too long: %s, stopping.\n", filename);
88+
exit(1);
89+
}
90+
write_pgm_image(directory, filename, current_image);
91+
}
92+
93+
free(labels);
94+
free(images);
95+
}
96+
97+
/*
98+
* Open the file, allocate a buffer, and read the images data
99+
* The caller should free the memory when done.
100+
*/
101+
byte *read_images(char *filename) {
102+
FILE *file = fopen(filename, "r");
103+
if (file == NULL) {
104+
perror("Unable to read images file");
105+
exit(1);
106+
}
107+
108+
int magic = read_int(file);
109+
if (magic != IMAGE_MAGIC) {
110+
fprintf(stderr, "%s is not an image file -- magic number does not match\n", filename);
111+
exit(1);
112+
}
113+
114+
num_images = read_int(file);
115+
height = read_int(file);
116+
width = read_int(file);
117+
if (num_images <= 0 || height <= 0 || width <= 0) {
118+
fprintf(stderr, "Error reading image data: num_images=%d, height=%d, width=%d\n", num_images, height, width);
119+
exit(1);
120+
}
121+
122+
int num_bytes = num_images * height * width;
123+
byte *images = calloc(num_bytes, sizeof(byte));
124+
125+
int bytes_read = fread(images, sizeof(byte), num_bytes, file);
126+
if (bytes_read != num_bytes) {
127+
fprintf(stderr, "Error read %d bytes from images file, expected %d\n", bytes_read, num_bytes);
128+
exit(1);
129+
}
130+
fclose(file);
131+
132+
return images;
133+
}
134+
135+
/*
136+
* Open the file, allocate a buffer, and read the label data
137+
* The caller should free the memory when done.
138+
*/
139+
byte *read_labels(char *filename) {
140+
FILE *file = fopen(filename, "r");
141+
if (file == NULL) {
142+
perror("Unable to read labels file");
143+
exit(1);
144+
}
145+
146+
int magic = read_int(file);
147+
148+
if (magic != LABEL_MAGIC) {
149+
fprintf(stderr, "%s is not a label file -- magic number does not match\n", filename);
150+
exit(1);
151+
}
152+
153+
num_labels = read_int(file);
154+
155+
byte *labels = calloc(num_labels, sizeof(byte));
156+
int items_read = fread(labels, sizeof(byte), num_labels, file);
157+
if (items_read != num_labels) {
158+
fprintf(stderr, "Error reading from label file\n");
159+
exit(1);
160+
}
161+
fclose(file);
162+
163+
return labels;
164+
}
165+
166+
/*
 * Read a 32-bit integer value stored in big-endian format.
 *
 * Exactly four bytes are consumed from the file and combined most
 * significant byte first, so the result does not depend on the host byte
 * order or on sizeof(int).
 *
 * Prints a message and exits with status 1 if four bytes cannot be read.
 * The value is returned as an int; callers treat non-positive results as
 * invalid.
 */
int read_int(FILE *file) {
    unsigned char raw[4];

    if (fread(raw, sizeof raw, 1, file) != 1) {
        fprintf(stderr, "Error reading int from file\n");
        exit(1);
    }

    unsigned long value = ((unsigned long)raw[0] << 24)
                        | ((unsigned long)raw[1] << 16)
                        | ((unsigned long)raw[2] << 8)
                        |  (unsigned long)raw[3];
    return (int)value;
}
178+
179+
/*
 * Return a non-zero value if and only if the path is an accessible directory.
 *
 * Returns zero when stat() fails (for example because the path does not
 * exist) or when the path names something other than a directory.
 *
 * The file type is tested with S_ISDIR rather than `st_mode & S_IFDIR`:
 * the type field is a code, not a set of flags, so a plain bit test is also
 * non-zero for any other file type whose code shares that bit.
 */
bool isDirectory(char *path) {
    struct stat info;

    return stat(path, &info) == 0 && S_ISDIR(info.st_mode);
}
190+
191+
/*
192+
* Save the image in PGM format with the path specified by the directory and filename provided
193+
*/
194+
void write_pgm_image(char *directory, char *filename, byte *image) {
195+
if (!isDirectory(directory)) {
196+
if (mkdir(directory, 0755) < 0) {
197+
perror("Unable to create directory");
198+
exit(1);
199+
}
200+
}
201+
202+
char path[MAX_PATH];
203+
bzero(&path, MAX_PATH);
204+
if (strlcpy(path, directory, MAX_PATH) >= MAX_PATH) {
205+
fprintf(stderr, "Directory path too long: %s, stopping.\n", path);
206+
exit(1);
207+
}
208+
if (strlcat(path, filename, MAX_PATH) >= MAX_PATH) {
209+
fprintf(stderr, "File path too long: %s, stopping.\n", path);
210+
exit(1);
211+
}
212+
213+
FILE *outfile = fopen(path, "w");
214+
if (outfile == NULL) {
215+
perror("Unable to open image file for writing");
216+
exit(1);
217+
}
218+
219+
fprintf(outfile, "%s %d %d %d\n", PGM_MAGIC, width, height, MAX_GRAY);
220+
int num_pixels = width * height;
221+
if (fwrite(image, sizeof(byte), num_pixels, outfile) != num_pixels) {
222+
perror("Unable to write to image file");
223+
exit(1);
224+
}
225+
226+
fclose(outfile);
227+
}

0 commit comments

Comments
 (0)