• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

get-set-fetch / scraper / 4403974148

pending completion
4403974148

push

github

Andrei Sabau
dependencies update

655 of 852 branches covered (76.88%)

Branch coverage included in aggregate %.

1574 of 1796 relevant lines covered (87.64%)

1266.63 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

58.49
/src/plugins/default/BrowserFetchPlugin.ts
1
import { SchemaType } from '../../schema/SchemaHelper';
2
import Project from '../../storage/base/Project';
3
import Resource from '../../storage/base/Resource';
4
import BrowserClient from '../../browserclient/BrowserClient';
5
import { DomStabilityStatus, waitForDomStability } from '../dom-utils';
1✔
6
import * as MimeTypes from '../../export/MimeTypes.json';
1✔
7
import { getLogger } from '../../logger/Logger';
1✔
8
import BaseFetchPlugin, { FetchError } from './BaseFetchPlugin';
1✔
9

10
/**
 * Opens html resources in a browser tab. Downloads binary resources.
 *
 * Whether a resource is html is decided from its url extension when possible,
 * otherwise from the content-type returned by a HEAD request (see `isHtml`).
 */
export default class BrowserFetchPlugin extends BaseFetchPlugin {
  /** JSON schema describing the options accepted by this plugin. */
  static get schema() {
    return {
      type: 'object',
      title: 'Browser Fetch Plugin',
      description: 'depending on resource type (binary, html), either downloads or opens in the scrape tab the resource url.',
      properties: {
        gotoOptions: {
          type: 'object',
          description: 'navigation parameters',
          properties: {
            timeout: {
              description: 'maximum navigation time in milliseconds',
              type: 'integer',
              default: 30000,
            },
            waitUntil: {
              description: 'when to consider navigation succeeded',
              type: 'string',
              default: 'domcontentloaded',
            },
          },
        },
        stabilityCheck: {
          type: 'integer',
          default: 0,
          title: 'Stability Check',
          description: 'Considers the page loaded and ready to be scraped when there are no more DOM changes within the specified amount of time (milliseconds). Only applies to html resources. Useful for bypassing preloader content.',
        },
        stabilityTimeout: {
          type: 'integer',
          default: 0,
          title: 'Max Stability Waiting Time',
          description: 'Maximum waiting time (miliseconds) for achieving DOM stability in case of a continuously updated DOM (ex: timers, countdowns).',
        },
      },
    } as const;
  }

  logger = getLogger('BrowserFetchPlugin');
  opts: SchemaType<typeof BrowserFetchPlugin.schema>;

  constructor(opts: SchemaType<typeof BrowserFetchPlugin.schema> = {}) {
    super(opts);
  }

  /**
   * Retrieves the resource either by opening it in the browser tab (html)
   * or by downloading it (anything else).
   *
   * @param project - not used by this implementation
   * @param resource - resource whose `url` is to be retrieved
   * @param client - browser client driving the scrape tab
   * @returns the partial resource produced by `openInTab` / `fetch`; any error
   * thrown along the way is converted via `this.fetchErrResult`
   */
  async apply(project: Project, resource: Resource, client: BrowserClient): Promise<Partial<Resource>> {
    try {
      // url is of html mime type, load it in a browser tab
      if (await this.isHtml(resource, client)) {
        this.logger.debug('resource determined to be html');
        // `return await` (not a bare `return`) so rejections are handled by the catch below
        return await this.openInTab(resource, client);
      }

      /*
      url is of non html mime type,
      download it and store it as Uint8Array compatible with both nodejs and browser env
      */
      return await this.fetch(resource, client);
    }
    catch (err) {
      return this.fetchErrResult(err);
    }
  }

  /**
   * Fetches the resource via the builtin `fetch`, executed inside the browser page.
   *
   * @param resource - resource whose `url` is requested
   * @param client - browser client used to navigate and evaluate in-page code
   * @param opts - extra `RequestInit` options, merged over `{ method: 'GET', credentials: 'include' }`
   * @returns `status` and `contentType`, plus `data` when a non-empty, non-html body was read
   * @throws FetchError with status 301 and the final url when the request was redirected
   * @throws FetchError with the response status when `this.isValidStatus` rejects it
   */
  async fetch(resource: Resource, client: BrowserClient, opts: RequestInit = {}): Promise<Partial<Resource>> {
    /*
    trying to load a resource from a different origin, CORS is in effect
    open the external url in a new browser tab
    only afterwards attempt to fetch it now that we're on the same origin
    this will request the resource twice, hopefully the 2nd time will be cached ...
    open just the external site root as the full external url may trigger a browser download, not supported in chrome headless
    */
    if (this.isCorsActive(client.getUrl(), resource.url)) {
      await client.goto(new URL('/', resource.url).toString(), { waitUntil: 'load' });
    }

    /*
    the function below is handed to client.evaluate, so it is kept fully self-contained:
    it must not reference anything from the enclosing scope
    */
    const { binaryString, headers, status, redirected, url }:
    { binaryString?: string, headers: { [key: string]: string }, status: number, redirected: boolean, url: string } = await client.evaluate(
      async ({ url, opts }: { url: string, opts: RequestInit }) => {
        const response = await fetch(url, { method: 'GET', credentials: 'include', ...opts });
        const { status, headers, redirected, url: finalUrl } = response;

        // Headers instance toJSON() produces an empty obj, manually serialize
        const headerObj: { [key: string]: string } = {};
        headers.forEach((value, key) => {
          headerObj[key.toLowerCase()] = value;
        });

        // html content is not downloaded here, only its headers are reported
        if (/html/.test(headerObj['content-type'])) {
          return { headers: headerObj, status, redirected, url: finalUrl };
        }

        const blob = await response.blob();

        /*
        FileReader is callback based, wrap just that part in a promise
        a read failure has to reject: throwing from inside the event handler would never settle the promise
        handlers are attached before the read is started
        */
        const binaryString = await new Promise<string>((resolve, reject) => {
          const reader = new FileReader();
          reader.onload = () => resolve(reader.result as string);
          reader.onerror = () => reject(new Error('error reading binary string'));
          reader.readAsBinaryString(blob);
        });

        return { binaryString, headers: headerObj, status, redirected, url: finalUrl };
      },
      { url: resource.url, opts },
    );

    this.logger.trace({ redirected, status, headers }, 'retrieved fetch headers');

    // don't have access to initial redirect status can't chain back to the original redirect one, always put 301
    if (redirected) {
      throw new FetchError(301, url);
    }

    // don't proceed further unless we have a valid status
    if (!this.isValidStatus(status)) {
      throw new FetchError(status);
    }

    const result: Partial<Resource> = {
      status,
      contentType: headers['content-type'],
    };

    // an empty body (ex: HEAD request) or an html response leaves data unset
    if (binaryString) {
      result.data = Buffer.from(binaryString, 'binary');
    }

    return result;
  }

  /**
   * Navigates the browser tab to the resource url and optionally waits for the DOM to become stable.
   *
   * @param resource - resource whose `url` is opened
   * @param client - browser client driving the tab
   * @returns the response `status` and the document `contentType`
   * @throws FetchError when the response status is not valid (304 is additionally allowed)
   * @throws Error when `stabilityCheck` is enabled and the DOM is reported as unstable
   * @throws FetchError with the redirect status and the final url when the navigation was redirected
   */
  async openInTab(resource: Resource, client: BrowserClient): Promise<Partial<Resource>> {
    const response = await client.goto(resource.url, this.opts.gotoOptions);
    const redirectResponse = await client.getRedirectResponse(response.request());

    this.logger.debug({ status: response.status() }, 'openInTab response');

    /*
    if the url has no extension, fetch HEADER was invoked to determine contentType, opening the html resource in tab will result in 304
    add the extra status to the allowed ones
    */
    if (!this.isValidStatus(response.status(), [ 304 ])) {
      throw new FetchError(response.status());
    }

    // what follows is status 2xx handling
    const contentType: string = await client.evaluate(() => document.contentType);

    if (/html/.test(contentType) && this.opts.stabilityCheck > 0) {
      const stabilityStatus: DomStabilityStatus = await client.evaluate(waitForDomStability, { stabilityCheck: this.opts.stabilityCheck, stabilityTimeout: this.opts.stabilityTimeout });
      if (stabilityStatus === DomStabilityStatus.Unstable) {
        throw new Error(`DOM not stable after stabilityTimeout of ${this.opts.stabilityTimeout}`);
      }
    }

    const result: Partial<Resource> = {
      status: response.status(),
      contentType,
    };

    /*
    both puppeteer and playwright follow redirects automatically
    puppeteer can control/abort redirects via page.setRequestInterception
    playwright can't: https://github.com/microsoft/playwright/issues/3993
    the redirect origin needs to be saved as an already scraped queue entry so we don't keep visiting it
    current resource will keep its initial url
    a new queue entry will be generated with last redirect location
    */
    if (redirectResponse) {
      throw new FetchError(redirectResponse.status(), response.url());
    }

    return result;
  }

  /**
   * Tells whether fetching `toBeFetchedUrl` from a page at `originUrl` is a cross-origin request.
   * Origins are compared (scheme + host + port), not just hostnames, so that
   * http vs https or different ports on the same host also count as cross-origin.
   *
   * @param originUrl - url of the page the request would be issued from
   * @param toBeFetchedUrl - url about to be fetched
   */
  isCorsActive(originUrl: string, toBeFetchedUrl: string): boolean {
    return new URL(originUrl).origin !== new URL(toBeFetchedUrl).origin;
  }

  /**
   * Extracts the file extension from the last segment of the url path.
   *
   * @param urlStr - absolute url
   * @returns the text after the last dot of the last path segment (ex: `gz` for `/a.tar.gz`),
   * or null when that segment has no dot followed by at least one character
   */
  getExtension(urlStr: string): string | null {
    const { pathname } = new URL(urlStr);

    // excluding "/" from the capture keeps a dot in a parent directory from being taken as an extension
    const extensionMatch = /\.([^./]+)$/.exec(pathname);

    return extensionMatch ? extensionMatch[1] : null;
  }

  /**
   * Determines whether the resource is an html one, first from the url extension,
   * then, if that is inconclusive, from the content-type of a HEAD request.
   *
   * @param resource - resource whose `url` is inspected
   * @param client - browser client, only used when the HEAD request is needed
   */
  async isHtml(resource: Resource, client: BrowserClient): Promise<boolean> {
    const ext = this.getExtension(resource.url);

    let isHtml: boolean;

    // try to determine if resource is scrapable (html content) based on extension type
    if (ext) {
      if (/htm/.test(ext)) {
        isHtml = true;
      }
      else if (Object.values(MimeTypes).includes(ext)) {
        isHtml = false;
      }
    }

    // extension type is missing from url or not present in the list of registered MimeTypes
    if (isHtml === undefined) {
      // just fetch the headers, returned contentType will be used to determine if resource is an html one
      const { contentType } = await this.fetch(resource, client, { method: 'HEAD' });
      isHtml = /htm/.test(contentType);
    }

    return isHtml;
  }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc