Skip to content

Commit b0fa9d9

Browse files
committed
Fixing bugs
* Fixed invalid JSON output when a page contains images.
* Guarded against invalid invocations of the doc.select() method (invalid page number range, invalid document type).
1 parent a95e492 commit b0fa9d9

File tree

6 files changed

+41
-18
lines changed

6 files changed

+41
-18
lines changed

examples/PDF2TextJS.py

+2
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ def SortBlocks(blocks):
3333

3434
sblocks = []
3535
for b in blocks:
36+
if b["type"] != "text": # only look at text blocks
37+
continue
3638
x0 = str(int(round(b["bbox"][0],0))).rjust(4,"0") # x coord in pixels
3739
y0 = str(int(round(b["bbox"][1],0))).rjust(4,"0") # y coord in pixels
3840
sortkey = y0 + x0 # = "yx"

fitz/fitz.i

+18-4
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,16 @@ struct fz_document_s {
269269
PyObject *o = PySequence_GetItem($input,i);
270270
if (PyInt_Check(o)) {
271271
$1[i] = (int) PyInt_AsLong(o);
272+
if ($1[i] < 0) {
273+
PyErr_SetString(PyExc_ValueError,"sequence elements must be >= 0");
274+
free($1);
275+
return NULL;
276+
}
277+
if ($1[i] >= fz_count_pages(gctx, arg1)) {
278+
PyErr_SetString(PyExc_ValueError,"sequence elements must be < pageCount");
279+
free($1);
280+
return NULL;
281+
}
272282
}
273283
else {
274284
PyErr_SetString(PyExc_ValueError,"sequence elements must be integers");
@@ -289,6 +299,7 @@ struct fz_document_s {
289299
pdf_document *pdf = pdf_specifics(gctx, $self);
290300
if (!pdf) {
291301
PyErr_SetString(PyExc_ValueError,"not a valid pdf document");
302+
free(liste);
292303
return -2;
293304
}
294305
globals glo = { 0 };
@@ -1411,8 +1422,11 @@ fz_send_data_base64(fz_context *ctx, fz_output *out, fz_buffer *buffer)
14111422
int c = buffer->data[3*i];
14121423
int d = buffer->data[3*i+1];
14131424
int e = buffer->data[3*i+2];
1414-
if ((i & 15) == 0)
1415-
fz_printf(ctx, out, "\n");
1425+
/*************************************************/
1426+
/* JSON decoders do not like interspersed "\n" ! */
1427+
/*************************************************/
1428+
//if ((i & 15) == 0)
1429+
// fz_printf(ctx, out, "\n");
14161430
fz_printf(ctx, out, "%c%c%c%c", set[c>>2], set[((c&3)<<4)|(d>>4)], set[((d&15)<<2)|(e>>6)], set[e & 63]);
14171431
}
14181432
i *= 3;
@@ -1499,11 +1513,11 @@ fz_print_stext_page_json(fz_context *ctx, fz_output *out, fz_stext_page *page)
14991513
fz_image_block *image = page->blocks[block_n].u.image;
15001514

15011515
fz_print_rect_json(ctx, out, &(image->bbox));
1502-
fz_printf(ctx, out, "\"type\":%d,\"width\":%d,\"height\":%d",
1516+
fz_printf(ctx, out, "\"imgtype\":%d,\"width\":%d,\"height\":%d,",
15031517
image->image->buffer->params.type,
15041518
image->image->w,
15051519
image->image->h);
1506-
fz_printf(ctx, out, "\"image\":");
1520+
fz_printf(ctx, out, "\"image\":\n");
15071521
if (image->image->buffer == NULL) {
15081522
fz_printf(ctx, out, "null");
15091523
} else {

fitz/fitz.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ class _object:
9494
import os
9595
VersionFitz = "1.9"
9696
VersionBind = "1.9.0"
97-
VersionDate = "2016-04-30 9:07:17"
97+
VersionDate = "2016-05-01 4:00:38"
9898

9999
class Document(_object):
100100
"""Proxy of C fz_document_s struct."""

fitz/fitz_wrap.c

+18-4
Original file line numberDiff line numberDiff line change
@@ -3769,6 +3769,7 @@ SWIGINTERN int fz_document_s__select(struct fz_document_s *self,int *liste,int a
37693769
pdf_document *pdf = pdf_specifics(gctx, self);
37703770
if (!pdf) {
37713771
PyErr_SetString(PyExc_ValueError,"not a valid pdf document");
3772+
free(liste);
37723773
return -2;
37733774
}
37743775
globals glo = { 0 };
@@ -4312,8 +4313,11 @@ fz_send_data_base64(fz_context *ctx, fz_output *out, fz_buffer *buffer)
43124313
int c = buffer->data[3*i];
43134314
int d = buffer->data[3*i+1];
43144315
int e = buffer->data[3*i+2];
4315-
if ((i & 15) == 0)
4316-
fz_printf(ctx, out, "\n");
4316+
/*************************************************/
4317+
/* JSON decoders do not like interspersed "\n" ! */
4318+
/*************************************************/
4319+
//if ((i & 15) == 0)
4320+
// fz_printf(ctx, out, "\n");
43174321
fz_printf(ctx, out, "%c%c%c%c", set[c>>2], set[((c&3)<<4)|(d>>4)], set[((d&15)<<2)|(e>>6)], set[e & 63]);
43184322
}
43194323
i *= 3;
@@ -4400,11 +4404,11 @@ fz_print_stext_page_json(fz_context *ctx, fz_output *out, fz_stext_page *page)
44004404
fz_image_block *image = page->blocks[block_n].u.image;
44014405

44024406
fz_print_rect_json(ctx, out, &(image->bbox));
4403-
fz_printf(ctx, out, "\"type\":%d,\"width\":%d,\"height\":%d",
4407+
fz_printf(ctx, out, "\"imgtype\":%d,\"width\":%d,\"height\":%d,",
44044408
image->image->buffer->params.type,
44054409
image->image->w,
44064410
image->image->h);
4407-
fz_printf(ctx, out, "\"image\":");
4411+
fz_printf(ctx, out, "\"image\":\n");
44084412
if (image->image->buffer == NULL) {
44094413
fz_printf(ctx, out, "null");
44104414
} else {
@@ -4958,6 +4962,16 @@ SWIGINTERN PyObject *_wrap_Document__select(PyObject *SWIGUNUSEDPARM(self), PyOb
49584962
PyObject *o = PySequence_GetItem(obj1,i);
49594963
if (PyInt_Check(o)) {
49604964
arg2[i] = (int) PyInt_AsLong(o);
4965+
if (arg2[i] < 0) {
4966+
PyErr_SetString(PyExc_ValueError,"sequence elements must be >= 0");
4967+
free(arg2);
4968+
return NULL;
4969+
}
4970+
if (arg2[i] >= fz_count_pages(gctx, arg1)) {
4971+
PyErr_SetString(PyExc_ValueError,"sequence elements must be < pageCount");
4972+
free(arg2);
4973+
return NULL;
4974+
}
49614975
}
49624976
else {
49634977
PyErr_SetString(PyExc_ValueError,"sequence elements must be integers");

fitz/utils.py

+1-8
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,7 @@ def select(*arg):
2323
raise ValueError("operation on closed document")
2424
if not doc.name.lower().endswith(("/pdf", ".pdf")):
2525
raise ValueError("only PDF documents supported")
26-
if not isinstance(liste, types.ListType):
27-
raise ValueError("must provide a list of pages")
28-
for l in liste:
29-
if not isinstance(l, numbers.Integral):
30-
raise ValueError("must be sequence of integers")
31-
if l < 0 or l >= doc.pageCount:
32-
raise ValueError("some page numbers outside valid range")
33-
doc._select(liste)
26+
return doc._select(liste)
3427

3528
#==============================================================================
3629
# A function for searching string occurrences on a page.

fitz/version.i

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@
22
import os
33
VersionFitz = "1.9"
44
VersionBind = "1.9.0"
5-
VersionDate = "2016-04-30 9:07:17"
5+
VersionDate = "2016-05-01 4:00:38"
66
%}

0 commit comments

Comments
 (0)