• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

adnsistemas / pdf-lib / #18

24 Mar 2026 08:15PM UTC coverage: 74.286% (+0.3%) from 74.001%
#18

push

David N. Abdala
Documentation change

2569 of 3981 branches covered (64.53%)

Branch coverage included in aggregate %.

7372 of 9401 relevant lines covered (78.42%)

297170.51 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.36
/src/core/parser/PDFParser.ts
1
import PDFCrossRefSection from '../document/PDFCrossRefSection';
54✔
2
import PDFHeader from '../document/PDFHeader';
54✔
3
import PDFTrailer from '../document/PDFTrailer';
54✔
4
import {
54✔
5
  MissingKeywordError,
6
  MissingPDFHeaderError,
7
  PDFInvalidObjectParsingError,
8
  ReparseError,
9
  StalledParserError,
10
} from '../errors';
11
import PDFDict from '../objects/PDFDict';
12
import PDFInvalidObject from '../objects/PDFInvalidObject';
54✔
13
import PDFName from '../objects/PDFName';
54✔
14
import PDFObject from '../objects/PDFObject';
15
import PDFRawStream from '../objects/PDFRawStream';
16
import PDFRef from '../objects/PDFRef';
54✔
17
import ByteStream from './ByteStream';
54✔
18
import PDFObjectParser from './PDFObjectParser';
54✔
19
import PDFObjectStreamParser from './PDFObjectStreamParser';
54✔
20
import PDFXRefStreamParser from './PDFXRefStreamParser';
54✔
21
import PDFContext from '../PDFContext';
54✔
22
import CharCodes from '../syntax/CharCodes';
54✔
23
import { Keywords } from '../syntax/Keywords';
54✔
24
import { IsDigit } from '../syntax/Numeric';
54✔
25
import { waitForTick } from '../../utils';
54✔
26
import { CipherTransformFactory } from '../crypto';
27
import PDFNumber from '../objects/PDFNumber';
54✔
28
import { isPDFInstance, PDFClasses } from '../../api/objects';
54✔
29

30
class PDFParser extends PDFObjectParser {
31
  /**
   * Creates a parser for the given PDF bytes.
   *
   * Every argument is forwarded, unchanged and in the same order, to the
   * `PDFParser` constructor, which supplies the default for any argument that
   * is left `undefined`.
   */
  static forBytesWithOptions = (
    pdfBytes: Uint8Array,
    objectsPerTick?: number,
    throwOnInvalidObject?: boolean,
    warnOnInvalidObjects?: boolean,
    capNumbers?: boolean,
    cryptoFactory?: CipherTransformFactory,
    forIncrementalUpdate?: boolean,
    preserveObjectsVersions?: boolean,
  ) => {
    const parser = new PDFParser(
      pdfBytes,
      objectsPerTick,
      throwOnInvalidObject,
      warnOnInvalidObjects,
      capNumbers,
      cryptoFactory,
      forIncrementalUpdate,
      preserveObjectsVersions,
    );
    return parser;
  };
51

52
  /** Object count interval at which `shouldWaitForTick` reports `true`. */
  private readonly objectsPerTick: number;
  /** When set, an unparseable indirect object aborts parsing with an `Error`. */
  private readonly throwOnInvalidObject: boolean;
  /** When set, unparseable indirect objects are reported via `console.warn`. */
  private readonly warnOnInvalidObjects: boolean;
  /** Flipped by `parseDocument`, which refuses to run a second time. */
  private alreadyParsed = false;
  /** Running counter incremented by every `shouldWaitForTick` call. */
  private parsedObjects = 0;

  /**
   * @param pdfBytes Raw bytes of the PDF; wrapped in a `ByteStream` and their
   *   length is recorded as `pdfFileDetails.pdfSize`.
   * @param objectsPerTick See the field of the same name (default `Infinity`).
   * @param throwOnInvalidObject See the field of the same name (default `false`).
   * @param warnOnInvalidObjects See the field of the same name (default `false`).
   * @param capNumbers Forwarded to the `PDFObjectParser` constructor
   *   (default `false`).
   * @param cryptoFactory Forwarded to the `PDFObjectParser` constructor; the
   *   context is flagged as decrypted when it carries a truthy `encryptionKey`.
   * @param forIncrementalUpdate When true, `pdfBytes` is retained on the
   *   context as `pdfFileDetails.originalBytes` (default `false`).
   * @param preserveObjectsVersions Passed to `PDFContext.create`
   *   (default `false`).
   */
  constructor(
    pdfBytes: Uint8Array,
    objectsPerTick = Infinity,
    throwOnInvalidObject = false,
    warnOnInvalidObjects = false,
    capNumbers = false,
    cryptoFactory?: CipherTransformFactory,
    forIncrementalUpdate = false,
    preserveObjectsVersions = false,
  ) {
    super(
      ByteStream.of(pdfBytes),
      PDFContext.create(preserveObjectsVersions),
      capNumbers,
      cryptoFactory,
    );

    this.objectsPerTick = objectsPerTick;
    this.throwOnInvalidObject = throwOnInvalidObject;
    this.warnOnInvalidObjects = warnOnInvalidObjects;

    const { context } = this;
    context.isDecrypted = Boolean(cryptoFactory?.encryptionKey);

    const fileDetails = context.pdfFileDetails;
    fileDetails.pdfSize = pdfBytes.length;
    if (forIncrementalUpdate) {
      fileDetails.originalBytes = pdfBytes;
    }
  }
83

84
  /**
   * Parses the whole byte stream into this parser's context and returns it.
   *
   * The header is parsed first, then document sections are parsed back to back
   * until the stream is exhausted. Afterwards the catalog reference is
   * recovered if necessary, and any object registered under `0 0 R` is removed
   * (with a console warning).
   *
   * @returns The populated `PDFContext`.
   * @throws ReparseError If this method was already called on this parser.
   * @throws StalledParserError If two consecutive sections finish at the same
   *   stream offset, i.e. the parser stopped making progress.
   */
  async parseDocument(): Promise<PDFContext> {
    if (this.alreadyParsed) {
      throw new ReparseError('PDFParser', 'parseDocument');
    }
    this.alreadyParsed = true;

    this.context.header = this.parseHeader();

    // Offset reached after the previous section; undefined before the first.
    let lastOffset: number | undefined;
    while (!this.bytes.done()) {
      await this.parseDocumentSection();

      const currentOffset = this.bytes.offset();
      if (currentOffset === lastOffset) {
        throw new StalledParserError(this.bytes.position());
      }
      lastOffset = currentOffset;
    }

    this.maybeRecoverRoot();

    const zeroRef = PDFRef.of(0);
    if (this.context.lookup(zeroRef)) {
      console.warn('Removing parsed object: 0 0 R');
      this.context.delete(zeroRef);
    }

    return this.context;
  }
111

112
  /**
   * Ensures `trailerInfo.Root` refers to a catalog dictionary.
   *
   * Nothing happens when the object currently referenced by Root is a
   * dictionary whose `Type` is `Catalog`. Otherwise every indirect object in
   * the context is examined and Root is re-pointed at each matching dictionary
   * in turn, so the last match in enumeration order is the one that sticks.
   * Root is left untouched when no object matches.
   */
  private maybeRecoverRoot(): void {
    const isCatalogDict = (candidate?: PDFObject): boolean => {
      if (!isPDFInstance(candidate, PDFClasses.PDFDict)) return false;
      const type = (candidate as PDFDict).lookup(PDFName.of('Type'));
      return type === PDFName.of('Catalog');
    };

    const currentRoot = this.context.lookup(this.context.trailerInfo.Root);
    if (isCatalogDict(currentRoot)) return;

    // No early exit on purpose: later matches overwrite earlier ones.
    for (const [ref, object] of this.context.enumerateIndirectObjects()) {
      if (isCatalogDict(object)) {
        this.context.trailerInfo.Root = ref;
      }
    }
  }
129

130
  /**
   * Scans forward one byte at a time until the header keyword matches, then
   * reads the `<major>.<minor>` version that follows it and skips the binary
   * comment that may come after the header.
   *
   * @returns The header built from the parsed major and minor version.
   * @throws MissingPDFHeaderError If the stream ends before the header keyword
   *   is found. Failures while reading the version numbers propagate from the
   *   helpers that raise them.
   */
  private parseHeader(): PDFHeader {
    while (!this.bytes.done()) {
      if (!this.matchKeyword(Keywords.header)) {
        // Not at the header yet: discard this byte and retry at the next one.
        this.bytes.next();
        continue;
      }

      const majorVersion = this.parseRawInt();
      this.bytes.assertNext(CharCodes.Period);
      const minorVersion = this.parseRawInt();

      const header = PDFHeader.forVersion(majorVersion, minorVersion);
      this.skipBinaryHeaderComment();
      return header;
    }

    throw new MissingPDFHeaderError(this.bytes.position());
  }
145

146
  /**
   * Parses an indirect object header of the form
   * `<object number> <generation number> obj`, allowing whitespace and
   * comments before each of the three tokens.
   *
   * The stream is NOT rewound when parsing fails; callers that need to
   * backtrack must save and restore the offset themselves.
   *
   * @returns The reference described by the two parsed integers.
   * @throws MissingKeywordError If the `obj` keyword does not follow the two
   *   integers. Failures while reading the integers propagate from
   *   `parseRawInt`.
   */
  private parseIndirectObjectHeader(): PDFRef {
    this.skipWhitespaceAndComments();
    const objNum = this.parseRawInt();

    this.skipWhitespaceAndComments();
    const genNum = this.parseRawInt();

    this.skipWhitespaceAndComments();
    const hasObjKeyword = this.matchKeyword(Keywords.obj);
    if (hasObjKeyword) {
      return PDFRef.of(objNum, genNum);
    }

    throw new MissingKeywordError(this.bytes.position(), Keywords.obj);
  }
160

161
  /**
   * Attempts to parse an indirect object header at the current position.
   *
   * @returns `true` when a header was parsed, in which case the stream is left
   *   just past it; `false` when parsing threw, in which case the stream is
   *   moved back to where it was on entry.
   */
  private matchIndirectObjectHeader(): boolean {
    const rewindTo = this.bytes.offset();
    let matched = false;
    try {
      this.parseIndirectObjectHeader();
      matched = true;
    } catch (e) {
      // Any parse failure simply means "no header here".
      this.bytes.moveTo(rewindTo);
    }
    return matched;
  }
171

172
  /**
   * Counts one more parsed object and reports whether the running count is a
   * multiple of `objectsPerTick`.
   *
   * Declared as an arrow-function property so that it stays bound to this
   * parser when it is handed to the object stream parser as a callback. With
   * the default `objectsPerTick` of `Infinity` the remainder is the count
   * itself, so the result is never `true`.
   */
  private shouldWaitForTick = () => {
    const count = this.parsedObjects + 1;
    this.parsedObjects = count;
    return count % this.objectsPerTick === 0;
  };
176

177
  /**
   * Parses one complete indirect object (`<header> <object> endobj`) and
   * registers it in the context under its reference.
   *
   * Raw streams receive extra handling based on their dictionary's `Type`:
   *  - `ObjStm`: the objects packed inside the stream are parsed into the
   *    context as well.
   *  - `XRef`: the stream's entries are parsed and, when there are any,
   *    collected into a new cross-reference section appended to
   *    `context.xrefs`.
   *
   * @returns The reference of the parsed object.
   * @throws Whatever the header or object parsers throw; the stream is not
   *   rewound in that case.
   */
  private async parseIndirectObject(): Promise<PDFRef> {
    const ref = this.parseIndirectObjectHeader();

    this.skipWhitespaceAndComments();
    const object = this.parseObject(ref);

    this.skipWhitespaceAndComments();
    // A missing `endobj` is tolerated: the keyword is consumed when present
    // and silently ignored otherwise.
    // TODO: Log a warning if this fails...
    this.matchKeyword(Keywords.endobj);

    // Resolve the stream type once instead of repeating the instance check and
    // dictionary lookup for every candidate type.
    if (isPDFInstance(object, PDFClasses.PDFRawStream)) {
      const stream = object as PDFRawStream;
      const streamType = stream.dict.lookup(PDFName.of('Type'));

      if (streamType === PDFName.of('ObjStm')) {
        await PDFObjectStreamParser.forStream(
          stream,
          this.shouldWaitForTick,
        ).parseIntoContext();
      } else if (streamType === PDFName.of('XRef')) {
        const entries = PDFXRefStreamParser.forStream(stream).parseIntoContext();
        if (entries.length) {
          const xref = PDFCrossRefSection.createEmpty();
          for (const entry of entries) {
            if (entry.deleted) xref.addDeletedEntry(entry.ref, entry.offset);
            else xref.addEntry(entry.ref, entry.offset);
          }
          this.context.xrefs.push(xref);
        }
      }
    }

    // always register the object and the ref, to properly handle object numeration
    this.context.assign(ref, object);

    return ref;
  }
222

223
  // TODO: Improve and clean this up
  /**
   * Fallback for an indirect object that could not be parsed normally.
   *
   * Re-reads the object header, then scans forward byte by byte for the
   * `endobj` keyword. Everything between the header and that keyword is stored
   * verbatim as a `PDFInvalidObject` under the object's reference.
   *
   * @returns The reference of the object that was stored.
   * @throws Error If `throwOnInvalidObject` is set.
   * @throws PDFInvalidObjectParsingError If the stream ends before `endobj` is
   *   found. Header parsing failures propagate as well.
   */
  private tryToParseInvalidIndirectObject() {
    const failurePosition = this.bytes.position();

    // NOTE: the trailing ")" is part of the existing message and is kept as-is
    // because callers may match on the exact text.
    const message = `Trying to parse invalid object: ${JSON.stringify(failurePosition)})`;
    if (this.throwOnInvalidObject) throw new Error(message);
    if (this.warnOnInvalidObjects) console.warn(message);

    const ref = this.parseIndirectObjectHeader();

    if (this.warnOnInvalidObjects) console.warn(`Invalid object ref: ${ref}`);

    this.skipWhitespaceAndComments();
    const contentStart = this.bytes.offset();

    let foundEndobj = false;
    while (!this.bytes.done()) {
      if (this.matchKeyword(Keywords.endobj)) {
        foundEndobj = true;
        break;
      }
      this.bytes.next();
    }

    if (!foundEndobj) throw new PDFInvalidObjectParsingError(failurePosition);

    // The stream now sits just past `endobj`; exclude the keyword itself.
    const contentEnd = this.bytes.offset() - Keywords.endobj.length;

    const invalidObject = PDFInvalidObject.of(
      this.bytes.slice(contentStart, contentEnd),
    );
    this.context.assign(ref, invalidObject);

    return ref;
  }
256

257
  /**
   * Parses consecutive indirect objects for as long as the next significant
   * byte is a digit.
   *
   * An object that fails to parse is retried from its starting offset with the
   * lenient invalid-object fallback. After each object, whitespace, comments
   * and junk bytes are skipped, and the loop awaits `waitForTick()` whenever
   * `shouldWaitForTick` says so.
   */
  private async parseIndirectObjects(): Promise<void> {
    this.skipWhitespaceAndComments();

    const atObjectStart = () =>
      !this.bytes.done() && IsDigit[this.bytes.peek()];

    while (atObjectStart()) {
      const objectStart = this.bytes.offset();

      try {
        await this.parseIndirectObject();
      } catch (e) {
        // TODO: Add tracing/logging mechanism to track when this happens!
        this.bytes.moveTo(objectStart);
        this.tryToParseInvalidIndirectObject();
      }
      this.skipWhitespaceAndComments();

      // TODO: Can this be done only when needed, to avoid harming performance?
      this.skipJibberish();

      const yieldNow = this.shouldWaitForTick();
      if (yieldNow) await waitForTick();
    }
  }
278

279
  /**
   * Parses a classic cross-reference table if the `xref` keyword is next.
   *
   * Each line of the table is read as a pair of integers. A pair followed by
   * `n` or `f` is an entry: the first integer is stored as its offset, the
   * second as its generation number, and it is numbered with the running
   * object number, which then increases by one. Any other pair is treated as
   * a subsection header whose first integer resets the running object number;
   * its second integer is parsed but not used.
   *
   * @returns `undefined` when the `xref` keyword is absent; otherwise the
   *   parsed section. A fresh empty section is returned instead (dropping the
   *   entries read so far) when an integer is not followed by a second one.
   */
  private maybeParseCrossRefSection(): PDFCrossRefSection | void {
    this.skipWhitespaceAndComments();
    if (!this.matchKeyword(Keywords.xref)) return;
    this.skipWhitespaceAndComments();

    const section = PDFCrossRefSection.createEmpty();
    // Stays at -1 until the first subsection header has been read.
    let nextObjectNumber = -1;

    while (!this.bytes.done() && IsDigit[this.bytes.peek()]) {
      const firstInt = this.parseRawInt();
      this.skipWhitespaceAndComments();

      // Check if second digit is valid integer
      if (!IsDigit[this.bytes.peek()]) {
        return PDFCrossRefSection.createEmpty();
      }

      const secondInt = this.parseRawInt();
      this.skipWhitespaceAndComments();

      const marker = this.bytes.peek();
      const isEntryLine = marker === CharCodes.n || marker === CharCodes.f;

      if (!isEntryLine) {
        nextObjectNumber = firstInt;
      } else {
        const ref = PDFRef.of(nextObjectNumber, secondInt);
        // Consumes the marker byte while checking which kind of entry it is.
        const inUse = this.bytes.next() === CharCodes.n;
        if (inUse) {
          section.addEntry(ref, firstInt);
        } else {
          section.addDeletedEntry(ref, firstInt);
        }
        nextObjectNumber += 1;
      }
      this.skipWhitespaceAndComments();
    }

    return section;
  }
317

318
  /**
   * Parses a trailer dictionary if the `trailer` keyword is next, and merges
   * it into `context.trailerInfo`.
   *
   * For each of `Size`, `Root`, `Encrypt`, `Info` and `ID`, the value from the
   * newly parsed dictionary is used when it is truthy; otherwise the value
   * already held by the context is kept.
   */
  private maybeParseTrailerDict(): void {
    this.skipWhitespaceAndComments();
    if (!this.matchKeyword(Keywords.trailer)) return;
    this.skipWhitespaceAndComments();

    const trailerDict = this.parseDict();

    const { context } = this;
    const previous = context.trailerInfo;

    const Size =
      trailerDict.lookupMaybe(PDFName.of('Size'), PDFNumber) || previous.Size;
    const Root = trailerDict.get(PDFName.of('Root')) || previous.Root;
    const Encrypt = trailerDict.get(PDFName.of('Encrypt')) || previous.Encrypt;
    const Info = trailerDict.get(PDFName.of('Info')) || previous.Info;
    const ID = trailerDict.get(PDFName.of('ID')) || previous.ID;

    context.trailerInfo = { Size, Root, Encrypt, Info, ID };

    // if open for incremental update, then deleted objects need to be preserved, and largestObjectNumber has to be Size-1
    if (context.trailerInfo.Size && context.pdfFileDetails.originalBytes) {
      context.largestObjectNumber = context.trailerInfo.Size.asNumber() - 1;
    }
  }
339

340
  /**
   * Parses a `startxref <offset>` trailer if the `startxref` keyword is next,
   * recording the offset on the context as `pdfFileDetails.prevStartXRef`.
   *
   * The end-of-file keyword is then consumed when present. It is attempted
   * twice: once after skipping plain whitespace only, and once more after
   * skipping whitespace and comments. (Presumably the first attempt avoids the
   * marker being swallowed as a comment; confirm against `Keywords.eof`.)
   *
   * @returns `undefined` when the `startxref` keyword is absent; otherwise a
   *   trailer built from the parsed offset.
   */
  private maybeParseTrailer(): PDFTrailer | void {
    this.skipWhitespaceAndComments();
    const hasStartXRef = this.matchKeyword(Keywords.startxref);
    if (!hasStartXRef) return;
    this.skipWhitespaceAndComments();

    const startXRefOffset = this.parseRawInt();
    this.context.pdfFileDetails.prevStartXRef = startXRefOffset;

    this.skipWhitespace();
    this.matchKeyword(Keywords.eof);
    this.skipWhitespaceAndComments();
    this.matchKeyword(Keywords.eof);
    this.skipWhitespaceAndComments();

    return PDFTrailer.forLastCrossRefSectionOffset(startXRefOffset);
  }
356

357
  /**
   * Parses one document section: a run of indirect objects, followed by an
   * optional cross-reference table, an optional trailer dictionary and an
   * optional `startxref` trailer. A parsed cross-reference table is appended
   * to `context.xrefs`. Junk bytes after the section are skipped.
   */
  private async parseDocumentSection(): Promise<void> {
    await this.parseIndirectObjects();

    const crossRefSection = this.maybeParseCrossRefSection();
    if (crossRefSection) {
      this.context.xrefs.push(crossRefSection);
    }

    this.maybeParseTrailerDict();
    this.maybeParseTrailer();

    // TODO: Can this be done only when needed, to avoid harming performance?
    this.skipJibberish();
  }
367

368
  /**
   * Valid PDF files do not need this step, but some invalid ones contain
   * gibberish between indirect objects. This method advances past such bytes
   * until the stream sits at the start of an indirect object header, an
   * `xref` keyword, a `trailer` keyword or a `startxref` keyword, or until the
   * stream is exhausted.
   *
   * Only bytes in the range `CharCodes.Space`..`CharCodes.Tilde` are tested as
   * a possible start of one of those structures; all other bytes are skipped
   * without further checks.
   */
  private skipJibberish(): void {
    this.skipWhitespaceAndComments();

    while (!this.bytes.done()) {
      const candidateOffset = this.bytes.offset();
      const current = this.bytes.peek();
      const isPrintableAscii =
        current >= CharCodes.Space && current <= CharCodes.Tilde;

      const atKnownStructure =
        isPrintableAscii &&
        (this.matchKeyword(Keywords.xref) ||
          this.matchKeyword(Keywords.trailer) ||
          this.matchKeyword(Keywords.startxref) ||
          this.matchIndirectObjectHeader());

      if (atKnownStructure) {
        // The probe consumed the match; rewind so the caller can parse it.
        this.bytes.moveTo(candidateOffset);
        break;
      }

      this.bytes.next();
    }
  }
394

395
  /**
   * Skips the binary comment that may follow a PDF header. The specification
   * (section 7.5.2 File Header) defines it as a "%" followed by 4 or more
   * bytes that are 128 or greater.
   *
   * In the wild, some PDFs omit the leading "%" and include bytes below 128
   * (e.g. 0 or 1), so stripping a well-formed comment is not sufficient. This
   * method therefore skips whitespace and comments and then probes for an
   * indirect object header:
   *  - if one parses, the stream is moved back to the start of that header;
   *  - otherwise one byte is discarded from wherever the failed probe left
   *    the stream (it is not rewound first), and whitespace and comments are
   *    skipped again.
   */
  private skipBinaryHeaderComment(): void {
    this.skipWhitespaceAndComments();
    try {
      const headerStart = this.bytes.offset();
      this.parseIndirectObjectHeader();
      this.bytes.moveTo(headerStart);
    } catch (e) {
      this.bytes.next();
      this.skipWhitespaceAndComments();
    }
  }
419
}
420

421
export default PDFParser;
54✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc