adnsistemas / pdf-lib / #18

Committed 24 Mar 2026 08:15PM UTC coverage: 74.286% (+0.3%) from 74.001%

Build # #18

Build Type

push

Committed by David N. Abdala

Commit Message

Documentation change

Coverage Stats

2569 of 3981 branches covered (64.53%)

Branch coverage included in aggregate %.

7372 of 9401 relevant lines covered (78.42%)

297170.51 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.36

/src/core/parser/PDFParser.ts

import PDFCrossRefSection from '../document/PDFCrossRefSection';
import PDFHeader from '../document/PDFHeader';
import PDFTrailer from '../document/PDFTrailer';
import {
  MissingKeywordError,
  MissingPDFHeaderError,
  PDFInvalidObjectParsingError,
  ReparseError,
  StalledParserError,
} from '../errors';
import PDFDict from '../objects/PDFDict';
import PDFInvalidObject from '../objects/PDFInvalidObject';
import PDFName from '../objects/PDFName';
import PDFObject from '../objects/PDFObject';
import PDFRawStream from '../objects/PDFRawStream';
import PDFRef from '../objects/PDFRef';
import ByteStream from './ByteStream';
import PDFObjectParser from './PDFObjectParser';
import PDFObjectStreamParser from './PDFObjectStreamParser';
import PDFXRefStreamParser from './PDFXRefStreamParser';
import PDFContext from '../PDFContext';
import CharCodes from '../syntax/CharCodes';
import { Keywords } from '../syntax/Keywords';
import { IsDigit } from '../syntax/Numeric';
import { waitForTick } from '../../utils';
import { CipherTransformFactory } from '../crypto';
import PDFNumber from '../objects/PDFNumber';
import { isPDFInstance, PDFClasses } from '../../api/objects';

/**
 * Parses the raw bytes of a PDF file into a `PDFContext`.
 *
 * The parser walks the byte stream from the header to the end of the input,
 * registering every indirect object it reads in the context and recording the
 * cross-reference sections, trailer dictionary entries and `startxref` offset
 * it comes across. Malformed input is tolerated where possible: unparsable
 * objects are stored as `PDFInvalidObject`s and bytes that do not belong to
 * any recognised structure are skipped.
 */
class PDFParser extends PDFObjectParser {
  /**
   * Factory that forwards all of its arguments, unchanged and in the same
   * order, to the `PDFParser` constructor.
   */
  static forBytesWithOptions = (
    pdfBytes: Uint8Array,
    objectsPerTick?: number,
    throwOnInvalidObject?: boolean,
    warnOnInvalidObjects?: boolean,
    capNumbers?: boolean,
    cryptoFactory?: CipherTransformFactory,
    forIncrementalUpdate?: boolean,
    preserveObjectsVersions?: boolean,
  ) =>
    new PDFParser(
      pdfBytes,
      objectsPerTick,
      throwOnInvalidObject,
      warnOnInvalidObjects,
      capNumbers,
      cryptoFactory,
      forIncrementalUpdate,
      preserveObjectsVersions,
    );

  private readonly objectsPerTick: number;
  private readonly throwOnInvalidObject: boolean;
  private readonly warnOnInvalidObjects: boolean;
  private alreadyParsed = false;
  private parsedObjects = 0;

  /**
   * @param pdfBytes The bytes of the PDF file to parse.
   * @param objectsPerTick How many parsed objects to process between calls to
   *   `waitForTick()`. With the default (`Infinity`) the parser never yields.
   * @param throwOnInvalidObject Throw instead of attempting recovery when an
   *   indirect object cannot be parsed.
   * @param warnOnInvalidObjects Emit `console.warn` messages while recovering
   *   from an unparsable indirect object.
   * @param capNumbers Forwarded to the `PDFObjectParser` base class.
   * @param cryptoFactory Forwarded to the `PDFObjectParser` base class; the
   *   context is flagged as decrypted when it carries an `encryptionKey`.
   * @param forIncrementalUpdate When set, the original bytes are retained in
   *   `context.pdfFileDetails.originalBytes`.
   * @param preserveObjectsVersions Forwarded to `PDFContext.create`.
   */
  constructor(
    pdfBytes: Uint8Array,
    objectsPerTick = Infinity,
    throwOnInvalidObject = false,
    warnOnInvalidObjects = false,
    capNumbers = false,
    cryptoFactory?: CipherTransformFactory,
    forIncrementalUpdate = false,
    preserveObjectsVersions = false,
  ) {
    super(
      ByteStream.of(pdfBytes),
      PDFContext.create(preserveObjectsVersions),
      capNumbers,
      cryptoFactory,
    );

    this.objectsPerTick = objectsPerTick;
    this.throwOnInvalidObject = throwOnInvalidObject;
    this.warnOnInvalidObjects = warnOnInvalidObjects;

    this.context.isDecrypted = Boolean(cryptoFactory?.encryptionKey);

    const fileDetails = this.context.pdfFileDetails;
    fileDetails.pdfSize = pdfBytes.length;
    if (forIncrementalUpdate) {
      fileDetails.originalBytes = pdfBytes;
    }
  }

  /**
   * Parses the whole document and returns the populated context.
   *
   * @returns The `PDFContext` holding everything that was parsed.
   * @throws ReparseError If this method was already called on this instance.
   * @throws StalledParserError If parsing a document section leaves the byte
   *   offset where the previous section left it (no progress was made).
   */
  async parseDocument(): Promise<PDFContext> {
    if (this.alreadyParsed) {
      throw new ReparseError('PDFParser', 'parseDocument');
    }
    this.alreadyParsed = true;

    this.context.header = this.parseHeader();

    // Keep consuming sections until the input is exhausted, bailing out if a
    // pass ends at the same offset as the one before it.
    let lastOffset: number | undefined;
    while (!this.bytes.done()) {
      await this.parseDocumentSection();

      const currentOffset = this.bytes.offset();
      if (currentOffset === lastOffset) {
        throw new StalledParserError(this.bytes.position());
      }
      lastOffset = currentOffset;
    }

    this.maybeRecoverRoot();

    // An object registered under `0 0 R` is dropped from the context.
    const zeroRef = PDFRef.of(0);
    if (this.context.lookup(zeroRef)) {
      console.warn('Removing parsed object: 0 0 R');
      this.context.delete(zeroRef);
    }

    return this.context;
  }

  /**
   * Repairs `trailerInfo.Root` when it does not resolve to a dictionary whose
   * `Type` is `Catalog`. Every indirect object in the context is inspected and
   * the root is pointed at each matching dictionary in turn, so the last match
   * in enumeration order is the one that remains.
   */
  private maybeRecoverRoot(): void {
    const isCatalogDict = (candidate?: PDFObject): boolean => {
      if (!isPDFInstance(candidate, PDFClasses.PDFDict)) return false;
      const type = (candidate as PDFDict).lookup(PDFName.of('Type'));
      return type === PDFName.of('Catalog');
    };

    const declaredRoot = this.context.lookup(this.context.trailerInfo.Root);
    if (isCatalogDict(declaredRoot)) return;

    for (const [ref, object] of this.context.enumerateIndirectObjects()) {
      if (isCatalogDict(object)) {
        this.context.trailerInfo.Root = ref;
      }
    }
  }

  /**
   * Scans forward one byte at a time until the header keyword matches, then
   * reads the `major.minor` version that follows it.
   *
   * @throws MissingPDFHeaderError If the input ends before the header keyword
   *   is found.
   */
  private parseHeader(): PDFHeader {
    while (!this.bytes.done()) {
      if (!this.matchKeyword(Keywords.header)) {
        this.bytes.next();
        continue;
      }

      const major = this.parseRawInt();
      this.bytes.assertNext(CharCodes.Period);
      const minor = this.parseRawInt();

      const header = PDFHeader.forVersion(major, minor);
      this.skipBinaryHeaderComment();
      return header;
    }

    throw new MissingPDFHeaderError(this.bytes.position());
  }

  /**
   * Reads an indirect object header of the form
   * `<object number> <generation number> obj`, allowing whitespace and
   * comments before each of the three tokens.
   *
   * @throws MissingKeywordError If the `obj` keyword does not follow the two
   *   integers.
   */
  private parseIndirectObjectHeader(): PDFRef {
    this.skipWhitespaceAndComments();
    const objNum = this.parseRawInt();

    this.skipWhitespaceAndComments();
    const genNum = this.parseRawInt();

    this.skipWhitespaceAndComments();
    const hasObjKeyword = this.matchKeyword(Keywords.obj);
    if (!hasObjKeyword) {
      throw new MissingKeywordError(this.bytes.position(), Keywords.obj);
    }

    return PDFRef.of(objNum, genNum);
  }

  /**
   * Reports whether an indirect object header can be parsed at the current
   * offset. On success the header has been consumed; on failure the stream is
   * rewound to where it was when this method was called.
   */
  private matchIndirectObjectHeader(): boolean {
    const rewindTo = this.bytes.offset();
    try {
      this.parseIndirectObjectHeader();
    } catch (_err) {
      this.bytes.moveTo(rewindTo);
      return false;
    }
    return true;
  }

  /**
   * Counts one more processed object and tells the caller whether the running
   * total has reached a multiple of `objectsPerTick`. Declared as an arrow
   * property because it is also handed to `PDFObjectStreamParser` as a
   * callback.
   */
  private shouldWaitForTick = (): boolean => {
    this.parsedObjects += 1;
    return this.parsedObjects % this.objectsPerTick === 0;
  };

  /**
   * Parses one indirect object (header, value and optional `endobj`) and
   * registers it in the context.
   *
   * Raw streams typed `ObjStm` are additionally parsed into the context via
   * `PDFObjectStreamParser`; raw streams typed `XRef` are parsed via
   * `PDFXRefStreamParser` and, when they yield entries, recorded as a
   * cross-reference section.
   */
  private async parseIndirectObject(): Promise<PDFRef> {
    const ref = this.parseIndirectObjectHeader();

    this.skipWhitespaceAndComments();
    const object = this.parseObject(ref);

    this.skipWhitespaceAndComments();
    // A missing `endobj` is deliberately tolerated rather than treated as an
    // error, so the result of the match is ignored.
    // TODO: Log a warning if this fails...
    this.matchKeyword(Keywords.endobj);

    if (isPDFInstance(object, PDFClasses.PDFRawStream)) {
      const stream = object as PDFRawStream;
      const streamType = stream.dict.lookup(PDFName.of('Type'));

      if (streamType === PDFName.of('ObjStm')) {
        await PDFObjectStreamParser.forStream(
          stream,
          this.shouldWaitForTick,
        ).parseIntoContext();
      } else if (streamType === PDFName.of('XRef')) {
        const entries = PDFXRefStreamParser.forStream(stream).parseIntoContext();
        if (entries.length > 0) {
          const section = PDFCrossRefSection.createEmpty();
          for (const entry of entries) {
            if (entry.deleted) {
              section.addDeletedEntry(entry.ref, entry.offset);
            } else {
              section.addEntry(entry.ref, entry.offset);
            }
          }
          this.context.xrefs.push(section);
        }
      }
    }

    // Always register the object and the ref, to properly handle object
    // numeration.
    this.context.assign(ref, object);

    return ref;
  }

  /**
   * Fallback used when an indirect object could not be parsed normally. After
   * re-reading the header, everything up to (but excluding) the next `endobj`
   * keyword is stored verbatim as a `PDFInvalidObject`.
   *
   * @throws Error If `throwOnInvalidObject` is enabled.
   * @throws PDFInvalidObjectParsingError If the input ends before an `endobj`
   *   keyword is found.
   */
  // TODO: Improve and clean this up
  private tryToParseInvalidIndirectObject(): PDFRef {
    const startPos = this.bytes.position();

    const msg = `Trying to parse invalid object: ${JSON.stringify(startPos)})`;
    if (this.throwOnInvalidObject) {
      throw new Error(msg);
    }
    if (this.warnOnInvalidObjects) {
      console.warn(msg);
    }

    const ref = this.parseIndirectObjectHeader();

    if (this.warnOnInvalidObjects) {
      console.warn(`Invalid object ref: ${ref}`);
    }

    this.skipWhitespaceAndComments();
    const bodyStart = this.bytes.offset();

    // Advance byte by byte until `endobj` is matched (and consumed).
    let foundEndobj = false;
    while (!this.bytes.done()) {
      if (this.matchKeyword(Keywords.endobj)) {
        foundEndobj = true;
        break;
      }
      this.bytes.next();
    }

    if (!foundEndobj) {
      throw new PDFInvalidObjectParsingError(startPos);
    }

    // The stream now sits just past `endobj`; step back over the keyword to
    // find where the object's raw bytes stop.
    const bodyEnd = this.bytes.offset() - Keywords.endobj.length;

    const invalidObject = PDFInvalidObject.of(
      this.bytes.slice(bodyStart, bodyEnd),
    );
    this.context.assign(ref, invalidObject);

    return ref;
  }

  /**
   * Parses consecutive indirect objects for as long as the next byte is a
   * digit. Objects that fail to parse are retried from their starting offset
   * with `tryToParseInvalidIndirectObject`.
   */
  private async parseIndirectObjects(): Promise<void> {
    this.skipWhitespaceAndComments();

    while (!this.bytes.done() && IsDigit[this.bytes.peek()]) {
      const objectStart = this.bytes.offset();

      try {
        await this.parseIndirectObject();
      } catch (_err) {
        // TODO: Add tracing/logging mechanism to track when this happens!
        this.bytes.moveTo(objectStart);
        this.tryToParseInvalidIndirectObject();
      }

      this.skipWhitespaceAndComments();

      // TODO: Can this be done only when needed, to avoid harming performance?
      this.skipJibberish();

      if (this.shouldWaitForTick()) {
        await waitForTick();
      }
    }
  }

  /**
   * Parses a classic cross-reference table if the stream is positioned at the
   * `xref` keyword.
   *
   * Each line is read as two integers. When they are followed by `n` or `f`
   * the line is an entry (in use / deleted) for the current object number,
   * which is then incremented; otherwise the first integer becomes the current
   * object number.
   *
   * @returns `undefined` when the `xref` keyword is absent; an empty section
   *   when a line's first integer is not followed by a digit; otherwise the
   *   section holding all entries that were read.
   */
  private maybeParseCrossRefSection(): PDFCrossRefSection | void {
    this.skipWhitespaceAndComments();
    if (!this.matchKeyword(Keywords.xref)) return;
    this.skipWhitespaceAndComments();

    const section = PDFCrossRefSection.createEmpty();
    let currentObjectNumber = -1;

    while (!this.bytes.done() && IsDigit[this.bytes.peek()]) {
      const firstInt = this.parseRawInt();
      this.skipWhitespaceAndComments();

      // Check if second digit is valid integer
      const secondIntFollows = IsDigit[this.bytes.peek()];
      if (!secondIntFollows) {
        return PDFCrossRefSection.createEmpty();
      }

      const secondInt = this.parseRawInt();
      this.skipWhitespaceAndComments();

      const marker = this.bytes.peek();
      const isEntryLine = marker === CharCodes.n || marker === CharCodes.f;

      if (!isEntryLine) {
        currentObjectNumber = firstInt;
      } else {
        const ref = PDFRef.of(currentObjectNumber, secondInt);
        const consumed = this.bytes.next();
        if (consumed === CharCodes.n) {
          section.addEntry(ref, firstInt);
        } else {
          section.addDeletedEntry(ref, firstInt);
        }
        currentObjectNumber += 1;
      }

      this.skipWhitespaceAndComments();
    }

    return section;
  }

  /**
   * Parses a trailer dictionary if the stream is positioned at the `trailer`
   * keyword and merges its `Size`, `Root`, `Encrypt`, `Info` and `ID` entries
   * into `context.trailerInfo`. Entries missing from the dictionary keep the
   * value already stored in the context.
   */
  private maybeParseTrailerDict(): void {
    this.skipWhitespaceAndComments();
    if (!this.matchKeyword(Keywords.trailer)) return;
    this.skipWhitespaceAndComments();

    const dict = this.parseDict();

    const { context } = this;
    const previous = context.trailerInfo;
    context.trailerInfo = {
      Size: dict.lookupMaybe(PDFName.of('Size'), PDFNumber) || previous.Size,
      Root: dict.get(PDFName.of('Root')) || previous.Root,
      Encrypt: dict.get(PDFName.of('Encrypt')) || previous.Encrypt,
      Info: dict.get(PDFName.of('Info')) || previous.Info,
      ID: dict.get(PDFName.of('ID')) || previous.ID,
    };

    // If open for incremental update, then deleted objects need to be
    // preserved, and largestObjectNumber has to be Size-1.
    const size = context.trailerInfo.Size;
    if (size && context.pdfFileDetails.originalBytes) {
      context.largestObjectNumber = size.asNumber() - 1;
    }
  }

  /**
   * Parses a `startxref` block if the stream is positioned at that keyword.
   * The offset that follows is stored in
   * `context.pdfFileDetails.prevStartXRef`, and up to two end-of-file markers
   * after it are consumed when present.
   *
   * @returns `undefined` when the `startxref` keyword is absent, otherwise a
   *   `PDFTrailer` for the offset that was read.
   */
  private maybeParseTrailer(): PDFTrailer | void {
    this.skipWhitespaceAndComments();
    if (!this.matchKeyword(Keywords.startxref)) return;
    this.skipWhitespaceAndComments();

    const lastXRefOffset = this.parseRawInt();
    this.context.pdfFileDetails.prevStartXRef = lastXRefOffset;

    // The end-of-file marker is matched twice; neither match is required.
    this.skipWhitespace();
    this.matchKeyword(Keywords.eof);
    this.skipWhitespaceAndComments();
    this.matchKeyword(Keywords.eof);
    this.skipWhitespaceAndComments();

    return PDFTrailer.forLastCrossRefSectionOffset(lastXRefOffset);
  }

  /**
   * Parses one document section: a run of indirect objects, then an optional
   * cross-reference table, trailer dictionary and `startxref` block, and
   * finally skips anything unrecognised that follows.
   */
  private async parseDocumentSection(): Promise<void> {
    await this.parseIndirectObjects();

    const section = this.maybeParseCrossRefSection();
    if (section) {
      this.context.xrefs.push(section);
    }

    this.maybeParseTrailerDict();
    this.maybeParseTrailer();

    // TODO: Can this be done only when needed, to avoid harming performance?
    this.skipJibberish();
  }

  /**
   * This operation is not necessary for valid PDF files. But some invalid PDFs
   * contain jibberish in between indirect objects. This method skips past that
   * jibberish, should it exist, until the stream is positioned at the start of
   * an `xref` keyword, a `trailer` keyword, a `startxref` keyword or an
   * indirect object header - or until the input is exhausted.
   */
  private skipJibberish(): void {
    this.skipWhitespaceAndComments();

    while (!this.bytes.done()) {
      const candidateStart = this.bytes.offset();
      const current = this.bytes.peek();

      // Only bytes in the `CharCodes.Space`..`CharCodes.Tilde` range can start
      // one of the structures we are looking for.
      const inSpaceToTildeRange =
        current >= CharCodes.Space && current <= CharCodes.Tilde;

      const startsKnownStructure =
        inSpaceToTildeRange &&
        (this.matchKeyword(Keywords.xref) ||
          this.matchKeyword(Keywords.trailer) ||
          this.matchKeyword(Keywords.startxref) ||
          this.matchIndirectObjectHeader());

      if (startsKnownStructure) {
        // The probe consumed the structure's start; rewind so the regular
        // parsing routines see it from its first byte.
        this.bytes.moveTo(candidateStart);
        break;
      }

      this.bytes.next();
    }
  }

  /**
   * Deals with the binary comment that may follow a PDF header. The
   * specification defines this binary comment (section 7.5.2 File Header) as
   * a sequence of 4 or more bytes that are 128 or greater, and which are
   * preceded by a "%".
   *
   * In the wild there are PDFs that omit the leading "%" and include bytes
   * that are less than 128 (e.g. 0 or 1), so the comment cannot be recognised
   * by its shape alone. Instead, after skipping whitespace and comments, this
   * method checks whether an indirect object header can be parsed. If it can,
   * the stream is rewound to the start of that header. If it cannot, one byte
   * is skipped from wherever the failed attempt stopped, followed by any
   * whitespace and comments.
   */
  private skipBinaryHeaderComment(): void {
    this.skipWhitespaceAndComments();
    try {
      const headerStart = this.bytes.offset();
      this.parseIndirectObjectHeader();
      this.bytes.moveTo(headerStart);
    } catch (_err) {
      this.bytes.next();
      this.skipWhitespaceAndComments();
    }
  }
}

export default PDFParser;

1	import PDFCrossRefSection from '../document/PDFCrossRefSection';	54✔
2	import PDFHeader from '../document/PDFHeader';	54✔
3	import PDFTrailer from '../document/PDFTrailer';	54✔
4	import {	54✔
5	MissingKeywordError,
6	MissingPDFHeaderError,
7	PDFInvalidObjectParsingError,
8	ReparseError,
9	StalledParserError,
10	} from '../errors';
11	import PDFDict from '../objects/PDFDict';
12	import PDFInvalidObject from '../objects/PDFInvalidObject';	54✔
13	import PDFName from '../objects/PDFName';	54✔
14	import PDFObject from '../objects/PDFObject';
15	import PDFRawStream from '../objects/PDFRawStream';
16	import PDFRef from '../objects/PDFRef';	54✔
17	import ByteStream from './ByteStream';	54✔
18	import PDFObjectParser from './PDFObjectParser';	54✔
19	import PDFObjectStreamParser from './PDFObjectStreamParser';	54✔
20	import PDFXRefStreamParser from './PDFXRefStreamParser';	54✔
21	import PDFContext from '../PDFContext';	54✔
22	import CharCodes from '../syntax/CharCodes';	54✔
23	import { Keywords } from '../syntax/Keywords';	54✔
24	import { IsDigit } from '../syntax/Numeric';	54✔
25	import { waitForTick } from '../../utils';	54✔
26	import { CipherTransformFactory } from '../crypto';
27	import PDFNumber from '../objects/PDFNumber';	54✔
28	import { isPDFInstance, PDFClasses } from '../../api/objects';	54✔
29
30	class PDFParser extends PDFObjectParser {
31	static forBytesWithOptions = (	54✔
32	pdfBytes: Uint8Array,
33	objectsPerTick?: number,
34	throwOnInvalidObject?: boolean,
35	warnOnInvalidObjects?: boolean,
36	capNumbers?: boolean,
37	cryptoFactory?: CipherTransformFactory,
38	forIncrementalUpdate?: boolean,
39	preserveObjectsVersions?: boolean,
40	) =>
41	new PDFParser(	177✔
42	pdfBytes,
43	objectsPerTick,
44	throwOnInvalidObject,
45	warnOnInvalidObjects,
46	capNumbers,
47	cryptoFactory,
48	forIncrementalUpdate,
49	preserveObjectsVersions,
50	);
51
52	private readonly objectsPerTick: number;
53	private readonly throwOnInvalidObject: boolean;
54	private readonly warnOnInvalidObjects: boolean;
55	private alreadyParsed = false;	177✔
56	private parsedObjects = 0;	177✔
57
58	constructor(
59	pdfBytes: Uint8Array,
60	objectsPerTick = Infinity,	27✔
61	throwOnInvalidObject = false,	27✔
62	warnOnInvalidObjects = false,	176✔
63	capNumbers = false,	28✔
64	cryptoFactory?: CipherTransformFactory,
65	forIncrementalUpdate = false,	27✔
66	preserveObjectsVersions = false,	27✔
67	) {
68	super(	177✔
69	ByteStream.of(pdfBytes),
70	PDFContext.create(preserveObjectsVersions),
71	capNumbers,
72	cryptoFactory,
73	);
74	this.objectsPerTick = objectsPerTick;	177✔
75	this.throwOnInvalidObject = throwOnInvalidObject;	177✔
76	this.warnOnInvalidObjects = warnOnInvalidObjects;	177✔
77	this.context.isDecrypted = !!cryptoFactory?.encryptionKey;	177✔
78	this.context.pdfFileDetails.pdfSize = pdfBytes.length;	177✔
79	if (forIncrementalUpdate) {	177✔
80	this.context.pdfFileDetails.originalBytes = pdfBytes;	40✔
81	}
82	}
83
84	async parseDocument(): Promise<PDFContext> {
85	if (this.alreadyParsed) {	178✔
86	throw new ReparseError('PDFParser', 'parseDocument');	1✔
87	}
88	this.alreadyParsed = true;	177✔
89
90	this.context.header = this.parseHeader();	177✔
91
92	let prevOffset;
93	while (!this.bytes.done()) {	176✔
94	await this.parseDocumentSection();	593✔
95	const offset = this.bytes.offset();	591✔
96	if (offset === prevOffset) {	591!
97	throw new StalledParserError(this.bytes.position());	×
98	}
99	prevOffset = offset;	591✔
100	}
101
102	this.maybeRecoverRoot();	174✔
103
104	if (this.context.lookup(PDFRef.of(0))) {	174✔
105	console.warn('Removing parsed object: 0 0 R');	1✔
106	this.context.delete(PDFRef.of(0));	1✔
107	}
108
109	return this.context;	174✔
110	}
111
112	private maybeRecoverRoot(): void {
113	const isValidCatalog = (obj?: PDFObject) =>	174✔
114	isPDFInstance(obj, PDFClasses.PDFDict) &&	224✔
115	(obj as PDFDict).lookup(PDFName.of('Type')) === PDFName.of('Catalog');
116
117	const catalog = this.context.lookup(this.context.trailerInfo.Root);	174✔
118
119	if (!isValidCatalog(catalog)) {	174✔
120	const indirectObjects = this.context.enumerateIndirectObjects();	11✔
121	for (let idx = 0, len = indirectObjects.length; idx < len; idx++) {	11✔
122	const [ref, object] = indirectObjects[idx];	50✔
123	if (isValidCatalog(object)) {	50✔
124	this.context.trailerInfo.Root = ref;	2✔
125	}
126	}
127	}
128	}
129
130	private parseHeader(): PDFHeader {
131	while (!this.bytes.done()) {	177✔
132	if (this.matchKeyword(Keywords.header)) {	950✔
133	const major = this.parseRawInt();	176✔
134	this.bytes.assertNext(CharCodes.Period);	176✔
135	const minor = this.parseRawInt();	176✔
136	const header = PDFHeader.forVersion(major, minor);	176✔
137	this.skipBinaryHeaderComment();	176✔
138	return header;	176✔
139	}
140	this.bytes.next();	774✔
141	}
142
143	throw new MissingPDFHeaderError(this.bytes.position());	1✔
144	}
145
146	private parseIndirectObjectHeader(): PDFRef {
147	this.skipWhitespaceAndComments();	137,467✔
148	const objectNumber = this.parseRawInt();	137,467✔
149
150	this.skipWhitespaceAndComments();	112,963✔
151	const generationNumber = this.parseRawInt();	112,963✔
152
153	this.skipWhitespaceAndComments();	110,121✔
154	if (!this.matchKeyword(Keywords.obj)) {	110,121✔
155	throw new MissingKeywordError(this.bytes.position(), Keywords.obj);	3✔
156	}
157
158	return PDFRef.of(objectNumber, generationNumber);	110,118✔
159	}
160
161	private matchIndirectObjectHeader(): boolean {
162	const initialOffset = this.bytes.offset();	82,225✔
163	try {	82,225✔
164	this.parseIndirectObjectHeader();	82,225✔
165	return true;	54,880✔
166	} catch (e) {
167	this.bytes.moveTo(initialOffset);	27,345✔
168	return false;	27,345✔
169	}
170	}
171
172	private shouldWaitForTick = () => {	177✔
173	this.parsedObjects += 1;	123,483✔
174	return this.parsedObjects % this.objectsPerTick === 0;	123,483✔
175	};
176
177	private async parseIndirectObject(): Promise<PDFRef> {
178	const ref = this.parseIndirectObjectHeader();	55,052✔
179
180	this.skipWhitespaceAndComments();	55,052✔
181	const object = this.parseObject(ref);	55,052✔
182
183	this.skipWhitespaceAndComments();	55,046✔
184	// if (!this.matchKeyword(Keywords.endobj)) {
185	// throw new MissingKeywordError(this.bytes.position(), Keywords.endobj);
186	// }
187
188	// TODO: Log a warning if this fails...
189	this.matchKeyword(Keywords.endobj);	55,046✔
190
191	if (	55,046✔
192	isPDFInstance(object, PDFClasses.PDFRawStream) &&	82,615✔
193	(object as PDFRawStream).dict.lookup(PDFName.of('Type')) ===
194	PDFName.of('ObjStm')
195	) {
196	await PDFObjectStreamParser.forStream(	2,022✔
197	object as PDFRawStream,
198	this.shouldWaitForTick,
199	).parseIntoContext();
200	} else if (	53,024✔
201	isPDFInstance(object, PDFClasses.PDFRawStream) &&	78,571✔
202	(object as PDFRawStream).dict.lookup(PDFName.of('Type')) ===
203	PDFName.of('XRef')
204	) {
205	const entries = PDFXRefStreamParser.forStream(	471✔
206	object as PDFRawStream,
207	).parseIntoContext();
208	if (entries.length) {	471✔
209	const xref = PDFCrossRefSection.createEmpty();	471✔
210	for (const entry of entries) {	471✔
211	if (entry.deleted) xref.addDeletedEntry(entry.ref, entry.offset);	125,558✔
212	else xref.addEntry(entry.ref, entry.offset);	42,246✔
213	}
214	this.context.xrefs.push(xref);	471✔
215	}
216	}
217	// always register the object and the ref, to properly handle object numeration
218	this.context.assign(ref, object);	55,036✔
219
220	return ref;	55,036✔
221	}
222
223	// TODO: Improve and clean this up
224	private tryToParseInvalidIndirectObject() {
225	const startPos = this.bytes.position();	16✔
226
227	const msg = `Trying to parse invalid object: ${JSON.stringify(startPos)})`;	16✔
228	if (this.throwOnInvalidObject) throw new Error(msg);	16✔
229	if (this.warnOnInvalidObjects) console.warn(msg);	14!
230
231	const ref = this.parseIndirectObjectHeader();	14✔
232
233	if (this.warnOnInvalidObjects) console.warn(`Invalid object ref: ${ref}`);	14!
234
235	this.skipWhitespaceAndComments();	14✔
236	const start = this.bytes.offset();	14✔
237
238	let failed = true;	14✔
239	while (!this.bytes.done()) {	14✔
240	if (this.matchKeyword(Keywords.endobj)) {	9,961✔
241	failed = false;	14✔
242	}
243	if (!failed) break;	9,961✔
244	this.bytes.next();	9,947✔
245	}
246
247	if (failed) throw new PDFInvalidObjectParsingError(startPos);	14!
248
249	const end = this.bytes.offset() - Keywords.endobj.length;	14✔
250
251	const object = PDFInvalidObject.of(this.bytes.slice(start, end));	14✔
252	this.context.assign(ref, object);	14✔
253
254	return ref;	14✔
255	}
256
257	private async parseIndirectObjects(): Promise<void> {
258	this.skipWhitespaceAndComments();	593✔
259
260	while (!this.bytes.done() && IsDigit[this.bytes.peek()]) {	593✔
261	const initialOffset = this.bytes.offset();	55,052✔
262
263	try {	55,052✔
264	await this.parseIndirectObject();	55,052✔
265	} catch (e) {
266	// TODO: Add tracing/logging mechanism to track when this happens!
267	this.bytes.moveTo(initialOffset);	16✔
268	this.tryToParseInvalidIndirectObject();	16✔
269	}
270	this.skipWhitespaceAndComments();	55,050✔
271
272	// TODO: Can this be done only when needed, to avoid harming performance?
273	this.skipJibberish();	55,050✔
274
275	if (this.shouldWaitForTick()) await waitForTick();	55,050✔
276	}
277	}
278
279	private maybeParseCrossRefSection(): PDFCrossRefSection \| void {
280	this.skipWhitespaceAndComments();	591✔
281	if (!this.matchKeyword(Keywords.xref)) return;	591✔
282	this.skipWhitespaceAndComments();	124✔
283
284	let objectNumber = -1;	124✔
285	const xref = PDFCrossRefSection.createEmpty();	124✔
286
287	while (!this.bytes.done() && IsDigit[this.bytes.peek()]) {	124✔
288	const firstInt = this.parseRawInt();	111,752✔
289	this.skipWhitespaceAndComments();	111,752✔
290
291	// Check if second digit is valid integer
292	if (!IsDigit[this.bytes.peek()]) {	111,752!
293	return PDFCrossRefSection.createEmpty();	×
294	}
295
296	const secondInt = this.parseRawInt();	111,752✔
297	this.skipWhitespaceAndComments();	111,752✔
298
299	const byte = this.bytes.peek();	111,752✔
300	if (byte === CharCodes.n \|\| byte === CharCodes.f) {	111,752✔
301	const ref = PDFRef.of(objectNumber, secondInt);	111,446✔
302	if (this.bytes.next() === CharCodes.n) {	111,446✔
303	xref.addEntry(ref, firstInt);	46,524✔
304	} else {
305	// this.context.delete(ref);
306	xref.addDeletedEntry(ref, firstInt);	64,922✔
307	}
308	objectNumber += 1;	111,446✔
309	} else {
310	objectNumber = firstInt;	306✔
311	}
312	this.skipWhitespaceAndComments();	111,752✔
313	}
314
315	return xref;	124✔
316	}
317
318	private maybeParseTrailerDict(): void {
319	this.skipWhitespaceAndComments();	591✔
320	if (!this.matchKeyword(Keywords.trailer)) return;	591✔
321	this.skipWhitespaceAndComments();	124✔
322
323	const dict = this.parseDict();	124✔
324
325	const { context } = this;	124✔
326	context.trailerInfo = {	124✔
327	Size:
328	dict.lookupMaybe(PDFName.of('Size'), PDFNumber) \|\|	129✔
329	context.trailerInfo.Size,
330	Root: dict.get(PDFName.of('Root')) \|\| context.trailerInfo.Root,	133✔
331	Encrypt: dict.get(PDFName.of('Encrypt')) \|\| context.trailerInfo.Encrypt,	243✔
332	Info: dict.get(PDFName.of('Info')) \|\| context.trailerInfo.Info,	134✔
333	ID: dict.get(PDFName.of('ID')) \|\| context.trailerInfo.ID,	188✔
334	};
335	// if open for incremental update, then deleted objects need to be preserved, and largestObjectNumber has to be Size-1
336	if (context.trailerInfo.Size && context.pdfFileDetails.originalBytes)	124✔
337	context.largestObjectNumber = context.trailerInfo.Size.asNumber() - 1;	37✔
338	}
339
340	private maybeParseTrailer(): PDFTrailer \| void {
341	this.skipWhitespaceAndComments();	591✔
342	if (!this.matchKeyword(Keywords.startxref)) return;	591✔
343	this.skipWhitespaceAndComments();	581✔
344
345	const offset = this.parseRawInt();	581✔
346	this.context.pdfFileDetails.prevStartXRef = offset;	581✔
347
348	this.skipWhitespace();	581✔
349	this.matchKeyword(Keywords.eof);	581✔
350	this.skipWhitespaceAndComments();	581✔
351	this.matchKeyword(Keywords.eof);	581✔
352	this.skipWhitespaceAndComments();	581✔
353
354	return PDFTrailer.forLastCrossRefSectionOffset(offset);	581✔
355	}
356
357	private async parseDocumentSection(): Promise<void> {
358	await this.parseIndirectObjects();	593✔
359	const xref = this.maybeParseCrossRefSection();	591✔
360	if (xref) this.context.xrefs.push(xref);	591✔
361	this.maybeParseTrailerDict();	591✔
362	this.maybeParseTrailer();	591✔
363
364	// TODO: Can this be done only when needed, to avoid harming performance?
365	this.skipJibberish();	591✔
366	}
367
368	/**
369	* This operation is not necessary for valid PDF files. But some invalid PDFs
370	* contain jibberish in between indirect objects. This method is designed to
371	* skip past that jibberish, should it exist, until it reaches the next
372	* indirect object header, an xref table section, or the file trailer.
373	*/
374	private skipJibberish(): void {
375	this.skipWhitespaceAndComments();	55,641✔
376	while (!this.bytes.done()) {	55,641✔
377	const initialOffset = this.bytes.offset();	128,815✔
378	const byte = this.bytes.peek();	128,815✔
379	const isAlphaNumeric = byte >= CharCodes.Space && byte <= CharCodes.Tilde;	128,815✔
380	if (isAlphaNumeric) {	128,815✔
381	if (	82,806✔
382	this.matchKeyword(Keywords.xref) \|\|	330,395✔
383	this.matchKeyword(Keywords.trailer) \|\|
384	this.matchKeyword(Keywords.startxref) \|\|
385	this.matchIndirectObjectHeader()
386	) {
387	this.bytes.moveTo(initialOffset);	55,461✔
388	break;	55,461✔
389	}
390	}
391	this.bytes.next();	73,354✔
392	}
393	}
394
395	/**
396	* Skips the binary comment following a PDF header. The specification
397	* defines this binary comment (section 7.5.2 File Header) as a sequence of 4
398	* or more bytes that are 128 or greater, and which are preceded by a "%".
399	*
400	* This would imply that to strip out this binary comment, we could check for
401	* a sequence of bytes starting with "%", and remove all subsequent bytes that
402	* are 128 or greater. This works for many documents that properly comply with
403	* the spec. But in the wild, there are PDFs that omit the leading "%", and
404	* include bytes that are less than 128 (e.g. 0 or 1). So in order to parse
405	* these headers correctly, we just throw out all bytes leading up to the
406	* first indirect object header.
407	*/
408	private skipBinaryHeaderComment(): void {
409	this.skipWhitespaceAndComments();	176✔
410	try {	176✔
411	const initialOffset = this.bytes.offset();	176✔
412	this.parseIndirectObjectHeader();	176✔
413	this.bytes.moveTo(initialOffset);	172✔
414	} catch (e) {
415	this.bytes.next();	4✔
416	this.skipWhitespaceAndComments();	4✔
417	}
418	}
419	}
420
421	export default PDFParser;	54✔

adnsistemas / pdf-lib / #18

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous