]> rtime.felk.cvut.cz Git - hornmich/skoda-qr-demo.git/blob - QRScanner/mobile/jni/pdf/pdf-repair.c
Add MuPDF native source codes
[hornmich/skoda-qr-demo.git] / QRScanner / mobile / jni / pdf / pdf-repair.c
1 #include "mupdf/pdf.h"
2
3 /* Scan file for objects and reconstruct xref table */
4
/* Largest object number accepted while repairing. PDF 1.7 defines the
 * limit to be 8388607, but mupdf is more lenient and allows up to
 * 10 << 20 (10485760). */
#define MAX_OBJECT_NUMBER (10 << 20)
7
/* One object located while scanning the file; collected in a list by
 * pdf_repair_xref and later copied into the rebuilt xref table. */
struct entry
{
        int num, gen;           /* object number and generation number */
        int ofs;                /* file offset recorded for the object number token */
        int stm_ofs, stm_len;   /* stream offset and length as reported by pdf_repair_obj */
};
16
/*
 * Parse the body of an object whose '<int> <int> obj' header has just been
 * lexed, and report where its stream (if any) is located.
 *
 * doc:      document being repaired; tokens are read from doc->file.
 * buf:      lexer buffer; buf->scratch is also reused as a 9 byte sliding
 *           window when scanning for "endstream".
 * stmofsp:  out; set to 0 on entry. If a 'stream' token is found it is set
 *           to the file position after one following byte has been consumed
 *           (plus a '\n' when that byte was '\r').
 * stmlenp:  optional out; set to -1 on entry and only overwritten when the
 *           end of the stream had to be located by scanning for "endstream".
 * encrypt,
 * id:       optional in/out; only used when both are non-NULL. If the object
 *           dictionary has Type XRef, its Encrypt and ID values (when
 *           present) replace *encrypt and *id as kept references.
 * page:     optional in/out; when doc->file_reading_linearly is set and the
 *           dictionary has Type Page, *page is replaced by a kept reference
 *           to the dictionary.
 * tmpofs:   out; written with fz_tell() before every token lexed in the skip
 *           loop and after the stream. It is not written if the very first
 *           token ends the scan.
 *
 * Returns the token at which scanning stopped: 'endobj', an error token,
 * EOF or an integer for objects without a stream; for objects with a stream,
 * the token following 'endobj' (or the token found in place of a missing
 * 'endobj').
 *
 * Throws if fz_tell() reports a negative position, rethrows
 * FZ_ERROR_TRYLATER from the nested parsers, and rethrows a dictionary parse
 * failure that happens at end of file.
 */
int
pdf_repair_obj(pdf_document *doc, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, pdf_obj **encrypt, pdf_obj **id, pdf_obj **page, int *tmpofs)
{
        pdf_token tok;
        int stm_len;
        fz_stream *file = doc->file;
        fz_context *ctx = file->ctx;

        /* Defaults: no stream seen, no scanned stream length. */
        *stmofsp = 0;
        if (stmlenp)
                *stmlenp = -1;

        /* Length taken from the dictionary; 0 means "unknown". */
        stm_len = 0;

        /* On entry to this function, we know that we've just seen
         * '<int> <int> obj'. We expect the next thing we see to be a
         * pdf object. Regardless of the type of thing we meet next
         * we only need to fully parse it if it is a dictionary. */
        tok = pdf_lex(file, buf);

        if (tok == PDF_TOK_OPEN_DICT)
        {
                pdf_obj *dict, *obj;

                /* NOTE(review): the original comment here spoke of sending a
                 * NULL xref so references are not resolved, but doc is what
                 * is actually passed to pdf_parse_dict. */
                fz_try(ctx)
                {
                        dict = pdf_parse_dict(doc, file, buf);
                }
                fz_catch(ctx)
                {
                        fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
                        /* Don't let a broken object at EOF overwrite a good one */
                        if (file->eof)
                                fz_rethrow_message(ctx, "broken object at EOF ignored");
                        /* Silently swallow the error and carry on with an
                         * empty dictionary. */
                        dict = pdf_new_dict(doc, 2);
                }

                /* Harvest Encrypt and ID from cross reference stream
                 * dictionaries (Type XRef), if the caller wants them. */
                if (encrypt && id)
                {
                        obj = pdf_dict_gets(dict, "Type");
                        if (pdf_is_name(obj) && !strcmp(pdf_to_name(obj), "XRef"))
                        {
                                obj = pdf_dict_gets(dict, "Encrypt");
                                if (obj)
                                {
                                        pdf_drop_obj(*encrypt);
                                        *encrypt = pdf_keep_obj(obj);
                                }

                                obj = pdf_dict_gets(dict, "ID");
                                if (obj)
                                {
                                        pdf_drop_obj(*id);
                                        *id = pdf_keep_obj(obj);
                                }
                        }
                }

                /* Only a direct integer Length is used as a hint; an
                 * indirect one is ignored. */
                obj = pdf_dict_gets(dict, "Length");
                if (!pdf_is_indirect(obj) && pdf_is_int(obj))
                        stm_len = pdf_to_int(obj);

                /* When reading linearly, hand back the most recent Page
                 * dictionary to the caller. */
                if (doc->file_reading_linearly && page)
                {
                        obj = pdf_dict_gets(dict, "Type");
                        if (!strcmp(pdf_to_name(obj), "Page"))
                        {
                                pdf_drop_obj(*page);
                                *page = pdf_keep_obj(dict);
                        }
                }

                pdf_drop_obj(dict);
        }

        /* Skip tokens until one that ends the scan: 'stream', 'endobj', an
         * error, EOF or an integer. The offset before each lexed token is
         * recorded in *tmpofs. */
        while ( tok != PDF_TOK_STREAM &&
                tok != PDF_TOK_ENDOBJ &&
                tok != PDF_TOK_ERROR &&
                tok != PDF_TOK_EOF &&
                tok != PDF_TOK_INT )
        {
                *tmpofs = fz_tell(file);
                if (*tmpofs < 0)
                        fz_throw(ctx, FZ_ERROR_GENERIC, "cannot tell in file");
                tok = pdf_lex(file, buf);
        }

        if (tok == PDF_TOK_STREAM)
        {
                /* Consume one byte after the keyword, and a '\n' as well if
                 * that byte was '\r'. */
                int c = fz_read_byte(file);
                if (c == '\r') {
                        c = fz_peek_byte(file);
                        if (c == '\n')
                                fz_read_byte(file);
                }

                *stmofsp = fz_tell(file);
                if (*stmofsp < 0)
                        fz_throw(ctx, FZ_ERROR_GENERIC, "cannot seek in file");

                /* First try the length from the dictionary: jump past the
                 * data and check that 'endstream' is the next token. On
                 * success *stmlenp is left at -1. */
                if (stm_len > 0)
                {
                        fz_seek(file, *stmofsp + stm_len, 0);
                        fz_try(ctx)
                        {
                                tok = pdf_lex(file, buf);
                        }
                        fz_catch(ctx)
                        {
                                fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
                                fz_warn(ctx, "cannot find endstream token, falling back to scanning");
                        }
                        if (tok == PDF_TOK_ENDSTREAM)
                                goto atobjend;
                        /* Length was wrong: rewind to the start of the data. */
                        fz_seek(file, *stmofsp, 0);
                }

                /* Fall back to a byte-wise scan: prime a 9 byte window (the
                 * number of bytes actually read is not checked)... */
                (void)fz_read(file, (unsigned char *) buf->scratch, 9);

                /* ...and slide it forward one byte at a time until it holds
                 * "endstream" or the file ends. */
                while (memcmp(buf->scratch, "endstream", 9) != 0)
                {
                        c = fz_read_byte(file);
                        if (c == EOF)
                                break;
                        memmove(&buf->scratch[0], &buf->scratch[1], 8);
                        buf->scratch[8] = c;
                }

                /* Length = bytes consumed since the stream start, minus the
                 * 9 bytes of the window. This is also computed when the scan
                 * stopped at EOF. */
                if (stmlenp)
                        *stmlenp = fz_tell(file) - *stmofsp - 9;

atobjend:
                /* Expect 'endobj' right after 'endstream'. */
                *tmpofs = fz_tell(file);
                if (*tmpofs < 0)
                        fz_throw(ctx, FZ_ERROR_GENERIC, "cannot tell in file");
                tok = pdf_lex(file, buf);
                if (tok != PDF_TOK_ENDOBJ)
                        fz_warn(ctx, "object missing 'endobj' token");
                else
                {
                        /* Read another token as we always return the next one */
                        *tmpofs = fz_tell(file);
                        if (*tmpofs < 0)
                                fz_throw(ctx, FZ_ERROR_GENERIC, "cannot tell in file");
                        tok = pdf_lex(file, buf);
                }
        }
        return tok;
}
168
169 static void
170 pdf_repair_obj_stm(pdf_document *doc, int num, int gen)
171 {
172         pdf_obj *obj;
173         fz_stream *stm = NULL;
174         pdf_token tok;
175         int i, n, count;
176         fz_context *ctx = doc->ctx;
177         pdf_lexbuf buf;
178
179         fz_var(stm);
180
181         pdf_lexbuf_init(ctx, &buf, PDF_LEXBUF_SMALL);
182
183         fz_try(ctx)
184         {
185                 obj = pdf_load_object(doc, num, gen);
186
187                 count = pdf_to_int(pdf_dict_gets(obj, "N"));
188
189                 pdf_drop_obj(obj);
190
191                 stm = pdf_open_stream(doc, num, gen);
192
193                 for (i = 0; i < count; i++)
194                 {
195                         pdf_xref_entry *entry;
196
197                         tok = pdf_lex(stm, &buf);
198                         if (tok != PDF_TOK_INT)
199                                 fz_throw(ctx, FZ_ERROR_GENERIC, "corrupt object stream (%d %d R)", num, gen);
200
201                         n = buf.i;
202                         if (n < 0)
203                         {
204                                 fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", n, i);
205                                 continue;
206                         }
207                         else if (n >= pdf_xref_len(doc))
208                         {
209                                 fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", n, i);
210                                 continue;
211                         }
212
213                         entry = pdf_get_populating_xref_entry(doc, n);
214                         entry->ofs = num;
215                         entry->gen = i;
216                         entry->stm_ofs = 0;
217                         pdf_drop_obj(entry->obj);
218                         entry->obj = NULL;
219                         entry->type = 'o';
220
221                         tok = pdf_lex(stm, &buf);
222                         if (tok != PDF_TOK_INT)
223                                 fz_throw(ctx, FZ_ERROR_GENERIC, "corrupt object stream (%d %d R)", num, gen);
224                 }
225         }
226         fz_always(ctx)
227         {
228                 fz_close(stm);
229                 pdf_lexbuf_fin(&buf);
230         }
231         fz_catch(ctx)
232         {
233                 fz_rethrow_message(ctx, "cannot load object stream object (%d %d R)", num, gen);
234         }
235 }
236
/*
 * Rebuild the xref table and trailer of doc by scanning its file from the
 * start for '<int> <int> obj' sequences and for bare dictionaries.
 *
 * Every object found is recorded (number, generation, offset, stream offset
 * and scanned stream length) and afterwards written into a solid xref
 * section covering 0..maxnum. Encrypt, ID, Root and Info are taken from any
 * dictionary found outside an object (and Encrypt/ID also from Type XRef
 * objects via pdf_repair_obj); where several candidates exist, later ones
 * generally replace earlier ones. A new trailer holding Size and whichever
 * of those keys were found is installed.
 *
 * Side effects: sets doc->repair_attempted, doc->dirty and
 * doc->freeze_updates.
 *
 * Throws if a repair was already attempted on doc, if the file position
 * cannot be obtained, or if an object cannot be parsed before any Root has
 * been seen. FZ_ERROR_TRYLATER from the nested parsers is always rethrown.
 * All references collected so far and the object list are released before
 * an error is propagated.
 */
void
pdf_repair_xref(pdf_document *doc)
{
        pdf_obj *dict, *obj = NULL;
        pdf_obj *length;

        /* Kept references gathered from trailer-like dictionaries. */
        pdf_obj *encrypt = NULL;
        pdf_obj *id = NULL;
        pdf_obj *root = NULL;
        pdf_obj *info = NULL;

        /* Growable array of the objects found so far. */
        struct entry *list = NULL;
        int listlen;
        int listcap;
        int maxnum = 0;

        /* num/gen hold the two most recent integers seen, numofs/genofs the
         * file offsets recorded for them. */
        int num = 0;
        int gen = 0;
        int tmpofs, numofs = 0, genofs = 0;
        int stm_len, stm_ofs;
        pdf_token tok;
        int next;
        int i, n, c;
        fz_context *ctx = doc->ctx;
        pdf_lexbuf *buf = &doc->lexbuf.base;

        /* These are changed inside fz_try and read in fz_catch. */
        fz_var(encrypt);
        fz_var(id);
        fz_var(root);
        fz_var(info);
        fz_var(list);
        fz_var(obj);

        /* Repair is attempted at most once per document. */
        if (doc->repair_attempted)
                fz_throw(doc->ctx, FZ_ERROR_GENERIC, "Repair failed already - not trying again");
        doc->repair_attempted = 1;

        doc->dirty = 1;
        /* Can't support incremental update after repair */
        doc->freeze_updates = 1;

        fz_seek(doc->file, 0, 0);

        fz_try(ctx)
        {
                pdf_xref_entry *entry;
                listlen = 0;
                listcap = 1024;
                list = fz_malloc_array(ctx, listcap, sizeof(struct entry));

                /* look for '%PDF' version marker within first kilobyte of file */
                n = fz_read(doc->file, (unsigned char *)buf->scratch, fz_mini(buf->size, 1024));

                /* Restart from the beginning unless a marker is found. */
                fz_seek(doc->file, 0, 0);
                for (i = 0; i < n - 4; i++)
                {
                        if (memcmp(&buf->scratch[i], "%PDF", 4) == 0)
                        {
                                fz_seek(doc->file, i + 8, 0); /* skip "%PDF-X.Y" */
                                break;
                        }
                }

                /* skip comment line after version marker since some generators
                 * forget to terminate the comment with a newline */
                /* NOTE(review): the loop below only consumes consecutive ' '
                 * and '%' bytes and then pushes back the first other byte. */
                c = fz_read_byte(doc->file);
                while (c >= 0 && (c == ' ' || c == '%'))
                        c = fz_read_byte(doc->file);
                fz_unread_byte(doc->file);

                /* Main scan: one token per iteration, unless the handling of
                 * an object already produced the next token. */
                while (1)
                {
                        /* Offset of the token about to be lexed. */
                        tmpofs = fz_tell(doc->file);
                        if (tmpofs < 0)
                                fz_throw(ctx, FZ_ERROR_GENERIC, "cannot tell in file");

                        fz_try(ctx)
                        {
                                tok = pdf_lex_no_string(doc->file, buf);
                        }
                        fz_catch(ctx)
                        {
                                fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
                                fz_warn(ctx, "ignoring the rest of the file");
                                break;
                        }

                        /* If we have the next token already, then we'll jump
                         * back here, rather than going through the top of
                         * the loop. */
                have_next_token:

                        if (tok == PDF_TOK_INT)
                        {
                                /* A negative integer cannot be part of an
                                 * object header: forget the candidates. */
                                if (buf->i < 0)
                                {
                                        num = 0;
                                        gen = 0;
                                        continue;
                                }
                                /* Shift: the previous integer becomes the
                                 * candidate object number, this one the
                                 * candidate generation. */
                                numofs = genofs;
                                num = gen;
                                genofs = tmpofs;
                                gen = buf->i;
                        }

                        else if (tok == PDF_TOK_OBJ)
                        {
                                /* Parse the object body. On return tok is
                                 * the token pdf_repair_obj stopped at and
                                 * tmpofs the offset it recorded for it. */
                                fz_try(ctx)
                                {
                                        stm_len = 0;
                                        stm_ofs = 0;
                                        tok = pdf_repair_obj(doc, buf, &stm_ofs, &stm_len, &encrypt, &id, NULL, &tmpofs);
                                }
                                fz_catch(ctx)
                                {
                                        fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
                                        /* If we haven't seen a root yet, there is nothing
                                         * we can do, but give up. Otherwise, we'll make
                                         * do. */
                                        if (!root)
                                                fz_rethrow(ctx);
                                        fz_warn(ctx, "cannot parse object (%d %d R) - ignoring rest of file", num, gen);
                                        break;
                                }

                                /* Drop objects whose number is out of range,
                                 * but still process the token in hand. */
                                if (num <= 0 || num > MAX_OBJECT_NUMBER)
                                {
                                        fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", num, gen);
                                        goto have_next_token;
                                }

                                gen = fz_clampi(gen, 0, 65535);

                                /* Grow the list by half once only one free
                                 * slot remains. */
                                if (listlen + 1 == listcap)
                                {
                                        listcap = (listcap * 3) / 2;
                                        list = fz_resize_array(ctx, list, listcap, sizeof(struct entry));
                                }

                                list[listlen].num = num;
                                list[listlen].gen = gen;
                                list[listlen].ofs = numofs;
                                list[listlen].stm_ofs = stm_ofs;
                                list[listlen].stm_len = stm_len;
                                listlen ++;

                                if (num > maxnum)
                                        maxnum = num;

                                goto have_next_token;
                        }

                        /* If we find a dictionary it is probably the trailer,
                         * but could be a stream (or bogus) dictionary caused
                         * by a corrupt file. */
                        else if (tok == PDF_TOK_OPEN_DICT)
                        {
                                fz_try(ctx)
                                {
                                        dict = pdf_parse_dict(doc, doc->file, buf);
                                }
                                fz_catch(ctx)
                                {
                                        fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
                                        /* If this was the real trailer dict
                                         * it was broken, in which case we are
                                         * in trouble. Keep going though in
                                         * case this was just a bogus dict. */
                                        continue;
                                }

                                /* A later Encrypt always replaces an earlier one. */
                                obj = pdf_dict_gets(dict, "Encrypt");
                                if (obj)
                                {
                                        pdf_drop_obj(encrypt);
                                        encrypt = pdf_keep_obj(obj);
                                }

                                /* Take this ID if none is held yet, if no
                                 * Encrypt is held, or if this dictionary
                                 * carries an Encrypt entry itself. */
                                obj = pdf_dict_gets(dict, "ID");
                                if (obj && (!id || !encrypt || pdf_dict_gets(dict, "Encrypt")))
                                {
                                        pdf_drop_obj(id);
                                        id = pdf_keep_obj(obj);
                                }

                                obj = pdf_dict_gets(dict, "Root");
                                if (obj)
                                {
                                        pdf_drop_obj(root);
                                        root = pdf_keep_obj(obj);
                                }

                                obj = pdf_dict_gets(dict, "Info");
                                if (obj)
                                {
                                        pdf_drop_obj(info);
                                        info = pdf_keep_obj(obj);
                                }

                                pdf_drop_obj(dict);
                                /* obj was only borrowed from dict; clear it
                                 * so the error path does not drop it. */
                                obj = NULL;
                        }

                        else if (tok == PDF_TOK_EOF)
                                break;
                        else
                        {
                                /* Any other token breaks a pending
                                 * '<int> <int> obj' sequence. After a lexer
                                 * error one more byte is consumed. */
                                if (tok == PDF_TOK_ERROR)
                                        fz_read_byte(doc->file);
                                num = 0;
                                gen = 0;
                        }

                }

                /* make xref reasonable */

                /*
                        Dummy access to entry to assure sufficient space in the xref table
                        and avoid repeated reallocs in the loop
                */
                /* Ensure that the first xref table is a 'solid' one from
                 * 0 to maxnum. */
                pdf_ensure_solid_xref(doc, maxnum);

                /* Copy the collected objects into the xref table. Entries
                 * are written in file order, so for a repeated object number
                 * the one found last wins. */
                for (i = 0; i < listlen; i++)
                {
                        entry = pdf_get_populating_xref_entry(doc, list[i].num);
                        entry->type = 'n';
                        entry->ofs = list[i].ofs;
                        entry->gen = list[i].gen;

                        entry->stm_ofs = list[i].stm_ofs;

                        /* correct stream length for unencrypted documents */
                        /* (stm_len is negative unless pdf_repair_obj had to
                         * scan for the end of the stream) */
                        if (!encrypt && list[i].stm_len >= 0)
                        {
                                dict = pdf_load_object(doc, list[i].num, list[i].gen);

                                length = pdf_new_int(doc, list[i].stm_len);
                                pdf_dict_puts(dict, "Length", length);
                                pdf_drop_obj(length);

                                pdf_drop_obj(dict);
                        }
                }

                /* Object 0 is the head of the free list. */
                entry = pdf_get_populating_xref_entry(doc, 0);
                entry->type = 'f';
                entry->ofs = 0;
                entry->gen = 65535;
                entry->stm_ofs = 0;

                /* Chain the free entries: walking from the highest number
                 * down, each free entry points at the previously visited
                 * (next higher) free entry, the highest one at 0. The
                 * generation is bumped, saturating at 65535. */
                next = 0;
                for (i = pdf_xref_len(doc) - 1; i >= 0; i--)
                {
                        entry = pdf_get_populating_xref_entry(doc, i);
                        if (entry->type == 'f')
                        {
                                entry->ofs = next;
                                if (entry->gen < 65535)
                                        entry->gen ++;
                                next = i;
                        }
                }

                /* create a repaired trailer, Root will be added later */

                obj = pdf_new_dict(doc, 5);
                /* During repair there is only a single xref section */
                pdf_set_populating_xref_trailer(doc, obj);
                pdf_drop_obj(obj);
                obj = NULL;

                obj = pdf_new_int(doc, maxnum + 1);
                pdf_dict_puts(pdf_trailer(doc), "Size", obj);
                pdf_drop_obj(obj);
                obj = NULL;

                /* Move each collected reference into the trailer; the local
                 * is cleared after the drop so the error path cannot drop it
                 * a second time. */
                if (root)
                {
                        pdf_dict_puts(pdf_trailer(doc), "Root", root);
                        pdf_drop_obj(root);
                        root = NULL;
                }
                if (info)
                {
                        pdf_dict_puts(pdf_trailer(doc), "Info", info);
                        pdf_drop_obj(info);
                        info = NULL;
                }

                if (encrypt)
                {
                        if (pdf_is_indirect(encrypt))
                        {
                                /* create new reference with non-NULL xref pointer */
                                obj = pdf_new_indirect(doc, pdf_to_num(encrypt), pdf_to_gen(encrypt));
                                pdf_drop_obj(encrypt);
                                encrypt = obj;
                                obj = NULL;
                        }
                        pdf_dict_puts(pdf_trailer(doc), "Encrypt", encrypt);
                        pdf_drop_obj(encrypt);
                        encrypt = NULL;
                }

                if (id)
                {
                        if (pdf_is_indirect(id))
                        {
                                /* create new reference with non-NULL xref pointer */
                                obj = pdf_new_indirect(doc, pdf_to_num(id), pdf_to_gen(id));
                                pdf_drop_obj(id);
                                id = obj;
                                obj = NULL;
                        }
                        pdf_dict_puts(pdf_trailer(doc), "ID", id);
                        pdf_drop_obj(id);
                        id = NULL;
                }

                fz_free(ctx, list);
        }
        fz_catch(ctx)
        {
                /* Release everything still held, then propagate. */
                pdf_drop_obj(encrypt);
                pdf_drop_obj(id);
                pdf_drop_obj(root);
                pdf_drop_obj(obj);
                pdf_drop_obj(info);
                fz_free(ctx, list);
                fz_rethrow(ctx);
        }
}
573
574 void
575 pdf_repair_obj_stms(pdf_document *doc)
576 {
577         fz_context *ctx = doc->ctx;
578         pdf_obj *dict;
579         int i;
580         int xref_len = pdf_xref_len(doc);
581
582         for (i = 0; i < xref_len; i++)
583         {
584                 pdf_xref_entry *entry = pdf_get_populating_xref_entry(doc, i);
585
586                 if (entry->stm_ofs)
587                 {
588                         dict = pdf_load_object(doc, i, 0);
589                         fz_try(ctx)
590                         {
591                                 if (!strcmp(pdf_to_name(pdf_dict_gets(dict, "Type")), "ObjStm"))
592                                         pdf_repair_obj_stm(doc, i, 0);
593                         }
594                         fz_catch(ctx)
595                         {
596                                 fz_warn(ctx, "ignoring broken object stream (%d 0 R)", i);
597                         }
598                         pdf_drop_obj(dict);
599                 }
600         }
601
602         /* Ensure that streamed objects reside inside a known non-streamed object */
603         for (i = 0; i < xref_len; i++)
604         {
605                 pdf_xref_entry *entry = pdf_get_populating_xref_entry(doc, i);
606
607                 if (entry->type == 'o' && pdf_get_populating_xref_entry(doc, entry->ofs)->type != 'n')
608                         fz_throw(doc->ctx, FZ_ERROR_GENERIC, "invalid reference to non-object-stream: %d (%d 0 R)", entry->ofs, i);
609         }
610 }