3 /* Scan file for objects and reconstruct xref table */
5 /* Defined in PDF 1.7 to be 8388607, but mupdf is more lenient: the limit used here, 10 << 20, is 10485760. */
6 #define MAX_OBJECT_NUMBER (10 << 20)
/*
 * Parse the body of one object during repair and hand back the token that
 * follows it (the caller assigns the result to its current token).
 *
 * doc      - document being repaired; doc->file is the stream that is read.
 * buf      - lexer buffer; buf->scratch doubles as the sliding window used
 *            when scanning for the endstream keyword.
 * stmofsp  - receives the file position recorded after the stream token
 *            and the byte handling that follows it.
 * stmlenp  - receives a stream length; the assignment visible here derives
 *            it from the position reached by the endstream scan.
 * encrypt, id - updated with kept references to the dictionary's Encrypt
 *            and ID entries (the lookups follow the Type == XRef test).
 * page     - optional; only consulted when doc->file_reading_linearly is
 *            set, and then receives a kept reference to the dictionary.
 * tmpofs   - updated with fz_tell() before each lookahead token is lexed.
 *
 * NOTE(review): this listing omits lines of the function (the embedded line
 * numbers skip), so the guards, braces and error-handling scaffolding around
 * several statements are not visible. The comments added here describe only
 * the statements that are shown.
 */
18 pdf_repair_obj(pdf_document *doc, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, pdf_obj **encrypt, pdf_obj **id, pdf_obj **page, int *tmpofs)
22 fz_stream *file = doc->file;
23 fz_context *ctx = file->ctx;
31 /* On entry to this function, we know that we've just seen
32 * '<int> <int> obj'. We expect the next thing we see to be a
33 * pdf object. Regardless of the type of thing we meet next
34 * we only need to fully parse it if it is a dictionary. */
35 tok = pdf_lex(file, buf);
37 if (tok == PDF_TOK_OPEN_DICT)
41 /* Send NULL xref so we don't try to resolve references */
/* NOTE(review): the call below passes doc, not NULL, so the comment above may be stale - confirm. */
44 dict = pdf_parse_dict(doc, file, buf);
/* Error path: a try-later error is always propagated; any other parse
 * failure is either rethrown with the EOF message or replaced by an empty
 * dictionary (the test that chooses between the two is not shown). */
48 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
49 /* Don't let a broken object at EOF overwrite a good one */
51 fz_rethrow_message(ctx, "broken object at EOF ignored");
52 /* Silently swallow the error */
53 dict = pdf_new_dict(doc, 2);
/* Pick out the dictionary entries the repair pass is interested in. */
58 obj = pdf_dict_gets(dict, "Type");
59 if (pdf_is_name(obj) && !strcmp(pdf_to_name(obj), "XRef"))
61 obj = pdf_dict_gets(dict, "Encrypt");
64 pdf_drop_obj(*encrypt);
65 *encrypt = pdf_keep_obj(obj);
68 obj = pdf_dict_gets(dict, "ID");
72 *id = pdf_keep_obj(obj);
/* Only a direct integer Length is used; an indirect one is not resolved here. */
77 obj = pdf_dict_gets(dict, "Length");
78 if (!pdf_is_indirect(obj) && pdf_is_int(obj))
79 stm_len = pdf_to_int(obj);
/* Page capture is limited to linear reading with a caller-supplied page pointer. */
81 if (doc->file_reading_linearly && page)
83 obj = pdf_dict_gets(dict, "Type");
84 if (!strcmp(pdf_to_name(obj), "Page"))
87 *page = pdf_keep_obj(dict);
/* Skip tokens until the stream keyword, endobj or an error is met (further
 * stop conditions are not shown), noting the offset before each token. */
94 while ( tok != PDF_TOK_STREAM &&
95 tok != PDF_TOK_ENDOBJ &&
96 tok != PDF_TOK_ERROR &&
100 *tmpofs = fz_tell(file);
102 fz_throw(ctx, FZ_ERROR_GENERIC, "cannot tell in file");
103 tok = pdf_lex(file, buf);
106 if (tok == PDF_TOK_STREAM)
/* One byte is read and the next one peeked before the stream offset is
 * taken; presumably this consumes the end-of-line after the keyword, but
 * the tests on c are not shown - confirm. */
108 int c = fz_read_byte(file);
110 c = fz_peek_byte(file);
115 *stmofsp = fz_tell(file);
117 fz_throw(ctx, FZ_ERROR_GENERIC, "cannot seek in file");
/* First attempt: jump stm_len bytes past the stream start and lex what is
 * found there, expecting the endstream token. */
121 fz_seek(file, *stmofsp + stm_len, 0);
124 tok = pdf_lex(file, buf);
128 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
129 fz_warn(ctx, "cannot find endstream token, falling back to scanning");
131 if (tok == PDF_TOK_ENDSTREAM)
133 fz_seek(file, *stmofsp, 0);
/* Fallback: starting from the stream offset, slide a 9-byte window (the
 * length of "endstream") through the data one byte at a time until it
 * matches; the length is then the distance scanned minus those 9 bytes. */
136 (void)fz_read(file, (unsigned char *) buf->scratch, 9);
138 while (memcmp(buf->scratch, "endstream", 9) != 0)
140 c = fz_read_byte(file);
143 memmove(&buf->scratch[0], &buf->scratch[1], 8);
148 *stmlenp = fz_tell(file) - *stmofsp - 9;
/* After the stream, endobj is expected; its absence only produces a warning. */
151 *tmpofs = fz_tell(file);
153 fz_throw(ctx, FZ_ERROR_GENERIC, "cannot tell in file");
154 tok = pdf_lex(file, buf);
155 if (tok != PDF_TOK_ENDOBJ)
156 fz_warn(ctx, "object missing 'endobj' token");
159 /* Read another token as we always return the next one */
160 *tmpofs = fz_tell(file);
162 fz_throw(ctx, FZ_ERROR_GENERIC, "cannot tell in file");
163 tok = pdf_lex(file, buf);
/*
 * Walk the header of the object stream (num gen R) and update the xref
 * entries of the objects it lists.
 *
 * The object is loaded to read its N entry, which is used as the number of
 * iterations. Each iteration lexes two integer tokens from the opened
 * stream; a non-integer token in either position throws FZ_ERROR_GENERIC
 * ("corrupt object stream"). Object numbers that are rejected are skipped
 * with a warning; the others have their populating xref entry fetched and
 * its cached obj dropped.
 *
 * The lexer buffer is finalised with pdf_lexbuf_fin(), and a failure is
 * rethrown with the "cannot load object stream object" message.
 *
 * NOTE(review): this listing omits lines of the function (the embedded line
 * numbers skip), including the assignment of n, the first range test and
 * the remaining updates to the xref entry.
 */
170 pdf_repair_obj_stm(pdf_document *doc, int num, int gen)
173 fz_stream *stm = NULL;
176 fz_context *ctx = doc->ctx;
181 pdf_lexbuf_init(ctx, &buf, PDF_LEXBUF_SMALL);
185 obj = pdf_load_object(doc, num, gen);
/* N drives the loop below; presumably it is the number of objects stored in the stream - confirm. */
187 count = pdf_to_int(pdf_dict_gets(obj, "N"));
191 stm = pdf_open_stream(doc, num, gen);
193 for (i = 0; i < count; i++)
195 pdf_xref_entry *entry;
/* First integer of the pair. */
197 tok = pdf_lex(stm, &buf);
198 if (tok != PDF_TOK_INT)
199 fz_throw(ctx, FZ_ERROR_GENERIC, "corrupt object stream (%d %d R)", num, gen);
/* NOTE(review): both warnings below print the loop index i in the second
 * position of "%d %d R", where the fatal errors print gen - confirm this is
 * intended. */
204 fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", n, i);
207 else if (n >= pdf_xref_len(doc))
209 fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", n, i);
213 entry = pdf_get_populating_xref_entry(doc, n);
217 pdf_drop_obj(entry->obj);
/* Second integer of the pair; only its token type is checked in the lines shown. */
221 tok = pdf_lex(stm, &buf);
222 if (tok != PDF_TOK_INT)
223 fz_throw(ctx, FZ_ERROR_GENERIC, "corrupt object stream (%d %d R)", num, gen);
229 pdf_lexbuf_fin(&buf);
233 fz_rethrow_message(ctx, "cannot load object stream object (%d %d R)", num, gen);
/*
 * Rebuild the xref table of doc by scanning the file from the start.
 *
 * The scan lexes tokens one at a time. Each object it accepts is appended
 * to a growable list holding its number, generation, offset, stream offset
 * and stream length. A dictionary met at the top level of the scan is
 * treated as a probable trailer and its Encrypt, ID, Root and Info entries
 * are remembered. Afterwards the xref entries are filled in from the list
 * and a replacement trailer is built with Size, Root, Info, Encrypt and ID.
 *
 * Throws FZ_ERROR_GENERIC if a repair has already been attempted on this
 * document. Sets doc->repair_attempted and doc->freeze_updates.
 *
 * NOTE(review): this listing omits lines of the function (the embedded line
 * numbers skip), so braces, labels, the try/catch scaffolding and several
 * guards are not visible. The comments added here describe only the
 * statements that are shown.
 */
238 pdf_repair_xref(pdf_document *doc)
240 pdf_obj *dict, *obj = NULL;
243 pdf_obj *encrypt = NULL;
245 pdf_obj *root = NULL;
246 pdf_obj *info = NULL;
248 struct entry *list = NULL;
255 int tmpofs, numofs = 0, genofs = 0;
256 int stm_len, stm_ofs;
260 fz_context *ctx = doc->ctx;
261 pdf_lexbuf *buf = &doc->lexbuf.base;
/* Repair is attempted at most once per document. */
270 if (doc->repair_attempted)
271 fz_throw(doc->ctx, FZ_ERROR_GENERIC, "Repair failed already - not trying again");
272 doc->repair_attempted = 1;
275 /* Can't support incremental update after repair */
276 doc->freeze_updates = 1;
278 fz_seek(doc->file, 0, 0);
282 pdf_xref_entry *entry;
285 list = fz_malloc_array(ctx, listcap, sizeof(struct entry));
287 /* look for '%PDF' version marker within first kilobyte of file */
288 n = fz_read(doc->file, (unsigned char *)buf->scratch, fz_mini(buf->size, 1024));
290 fz_seek(doc->file, 0, 0);
/* NOTE(review): with the bound i < n - 4 the last 4-byte window, the one
 * starting at n - 4, is never compared - confirm this is intended. */
291 for (i = 0; i < n - 4; i++)
293 if (memcmp(&buf->scratch[i], "%PDF", 4) == 0)
295 fz_seek(doc->file, i + 8, 0); /* skip "%PDF-X.Y" */
300 /* skip comment line after version marker since some generators
301 * forget to terminate the comment with a newline */
302 c = fz_read_byte(doc->file);
303 while (c >= 0 && (c == ' ' || c == '%'))
304 c = fz_read_byte(doc->file);
305 fz_unread_byte(doc->file);
/* Record the file position before lexing the next token. */
309 tmpofs = fz_tell(doc->file);
311 fz_throw(ctx, FZ_ERROR_GENERIC, "cannot tell in file");
315 tok = pdf_lex_no_string(doc->file, buf);
319 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
320 fz_warn(ctx, "ignoring the rest of the file");
324 /* If we have the next token already, then we'll jump
325 * back here, rather than going through the top of
329 if (tok == PDF_TOK_INT)
343 else if (tok == PDF_TOK_OBJ)
349 tok = pdf_repair_obj(doc, buf, &stm_ofs, &stm_len, &encrypt, &id, NULL, &tmpofs);
353 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
354 /* If we haven't seen a root yet, there is nothing
355 * we can do, but give up. Otherwise, we'll make
359 fz_warn(ctx, "cannot parse object (%d %d R) - ignoring rest of file", num, gen);
363 if (num <= 0 || num > MAX_OBJECT_NUMBER)
365 fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", num, gen);
366 goto have_next_token;
369 gen = fz_clampi(gen, 0, 65535);
371 if (listlen + 1 == listcap)
373 listcap = (listcap * 3) / 2;
374 list = fz_resize_array(ctx, list, listcap, sizeof(struct entry));
377 list[listlen].num = num;
378 list[listlen].gen = gen;
379 list[listlen].ofs = numofs;
380 list[listlen].stm_ofs = stm_ofs;
381 list[listlen].stm_len = stm_len;
387 goto have_next_token;
390 /* If we find a dictionary it is probably the trailer,
391 * but could be a stream (or bogus) dictionary caused
392 * by a corrupt file. */
393 else if (tok == PDF_TOK_OPEN_DICT)
397 dict = pdf_parse_dict(doc, doc->file, buf);
401 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
402 /* If this was the real trailer dict
403 * it was broken, in which case we are
404 * in trouble. Keep going though in
405 * case this was just a bogus dict. */
/* Retain references to the Encrypt, ID, Root and Info entries found in this
 * dictionary (the guards around each assignment are only partly shown). */
409 obj = pdf_dict_gets(dict, "Encrypt");
412 pdf_drop_obj(encrypt);
413 encrypt = pdf_keep_obj(obj);
416 obj = pdf_dict_gets(dict, "ID");
417 if (obj && (!id || !encrypt || pdf_dict_gets(dict, "Encrypt")))
420 id = pdf_keep_obj(obj);
423 obj = pdf_dict_gets(dict, "Root");
427 root = pdf_keep_obj(obj);
430 obj = pdf_dict_gets(dict, "Info");
434 info = pdf_keep_obj(obj);
441 else if (tok == PDF_TOK_EOF)
/* After a lexer error one byte is consumed, presumably so that the scan
 * moves past the offending input - confirm. */
445 if (tok == PDF_TOK_ERROR)
446 fz_read_byte(doc->file);
453 /* make xref reasonable */
456 Dummy access to entry to assure sufficient space in the xref table
457 and avoid repeated reallocs in the loop
459 /* Ensure that the first xref table is a 'solid' one from
461 pdf_ensure_solid_xref(doc, maxnum);
463 for (i = 0; i < listlen; i++)
465 entry = pdf_get_populating_xref_entry(doc, list[i].num);
467 entry->ofs = list[i].ofs;
468 entry->gen = list[i].gen;
470 entry->stm_ofs = list[i].stm_ofs;
472 /* correct stream length for unencrypted documents */
473 if (!encrypt && list[i].stm_len >= 0)
/* The recorded stream length is written into the object's Length entry. */
475 dict = pdf_load_object(doc, list[i].num, list[i].gen);
477 length = pdf_new_int(doc, list[i].stm_len);
478 pdf_dict_puts(dict, "Length", length);
479 pdf_drop_obj(length);
485 entry = pdf_get_populating_xref_entry(doc, 0);
/* Walk the table from the last entry down to entry 0, examining the entries
 * whose type is 'f'. */
492 for (i = pdf_xref_len(doc) - 1; i >= 0; i--)
494 entry = pdf_get_populating_xref_entry(doc, i);
495 if (entry->type == 'f')
498 if (entry->gen < 65535)
504 /* create a repaired trailer, Root will be added later */
506 obj = pdf_new_dict(doc, 5);
507 /* During repair there is only a single xref section */
508 pdf_set_populating_xref_trailer(doc, obj);
/* Size is stored as maxnum + 1; maxnum is presumably the highest object
 * number seen during the scan - confirm. */
512 obj = pdf_new_int(doc, maxnum + 1);
513 pdf_dict_puts(pdf_trailer(doc), "Size", obj);
519 pdf_dict_puts(pdf_trailer(doc), "Root", root);
525 pdf_dict_puts(pdf_trailer(doc), "Info", info);
532 if (pdf_is_indirect(encrypt))
534 /* create new reference with non-NULL xref pointer */
535 obj = pdf_new_indirect(doc, pdf_to_num(encrypt), pdf_to_gen(encrypt));
536 pdf_drop_obj(encrypt);
540 pdf_dict_puts(pdf_trailer(doc), "Encrypt", encrypt);
541 pdf_drop_obj(encrypt);
547 if (pdf_is_indirect(id))
549 /* create new reference with non-NULL xref pointer */
550 obj = pdf_new_indirect(doc, pdf_to_num(id), pdf_to_gen(id));
555 pdf_dict_puts(pdf_trailer(doc), "ID", id);
564 pdf_drop_obj(encrypt);
/*
 * Second repair pass over the xref table of doc.
 *
 * First loop: for indices below the xref length read on entry, loads object
 * i with generation 0 and, when its Type is ObjStm, passes it to
 * pdf_repair_obj_stm(). A broken object stream produces a warning rather
 * than a failure.
 *
 * Second loop: throws FZ_ERROR_GENERIC if an entry of type 'o' refers, via
 * its ofs field, to an entry whose type is not 'n'.
 *
 * NOTE(review): this listing omits lines of the function (the embedded line
 * numbers skip), including the test that selects which entries are loaded
 * and the scaffolding that catches the failure before the warning.
 */
575 pdf_repair_obj_stms(pdf_document *doc)
577 fz_context *ctx = doc->ctx;
/* The table length is read once here and used as the bound of both loops. */
580 int xref_len = pdf_xref_len(doc);
582 for (i = 0; i < xref_len; i++)
584 pdf_xref_entry *entry = pdf_get_populating_xref_entry(doc, i);
588 dict = pdf_load_object(doc, i, 0);
591 if (!strcmp(pdf_to_name(pdf_dict_gets(dict, "Type")), "ObjStm"))
592 pdf_repair_obj_stm(doc, i, 0);
596 fz_warn(ctx, "ignoring broken object stream (%d 0 R)", i);
602 /* Ensure that streamed objects reside inside a known non-streamed object */
603 for (i = 0; i < xref_len; i++)
605 pdf_xref_entry *entry = pdf_get_populating_xref_entry(doc, i);
/* For a type 'o' entry, ofs is used as the index of the containing object. */
607 if (entry->type == 'o' && pdf_get_populating_xref_entry(doc, entry->ofs)->type != 'n')
608 fz_throw(doc->ctx, FZ_ERROR_GENERIC, "invalid reference to non-object-stream: %d (%d 0 R)", entry->ofs, i);