• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

microlinkhq / html-get / 15533305734

09 Jun 2025 11:19AM UTC coverage: 98.792%. Remained the same
15533305734

push

github

web-flow
build(deps): bump @metascraper/helpers from 5.46.18 to 5.47.1 (#221)

Bumps [@metascraper/helpers](https://github.com/microlinkhq/metascraper/tree/HEAD/packages/metascraper-helpers) from 5.46.18 to 5.47.1.
- [Release notes](https://github.com/microlinkhq/metascraper/releases)
- [Changelog](https://github.com/microlinkhq/metascraper/blob/master/packages/metascraper-helpers/CHANGELOG.md)
- [Commits](https://github.com/microlinkhq/metascraper/commits/v5.47.1/packages/metascraper-helpers)

---
updated-dependencies:
- dependency-name: "@metascraper/helpers"
  dependency-version: 5.47.1
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Kiko Beats <josefrancisco.verdu@gmail.com>

140 of 147 branches covered (95.24%)

Branch coverage included in aggregate %.

596 of 598 relevant lines covered (99.67%)

25.15 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

99.36
/src/html.js
1
'use strict'
13✔
2

13✔
3
const debug = require('debug-logfmt')('html-get:rewrite')
13✔
4
const { get, castArray, forEach } = require('lodash')
13✔
5
const isLocalAddress = require('is-local-address')
13✔
6
const { TAGS: URL_TAGS } = require('html-urls')
13✔
7
const isHTML = require('is-html-content')
13✔
8
const cssUrl = require('css-url-regex')
13✔
9
const execall = require('execall')
13✔
10
const cheerio = require('cheerio')
13✔
11
const { URL } = require('url')
13✔
12
const path = require('path')
13✔
13

13✔
14
const {
13✔
15
  date: toDate,
13✔
16
  isMime,
13✔
17
  isUrl,
13✔
18
  mimeExtension,
13✔
19
  parseUrl
13✔
20
} = require('@metascraper/helpers')
13✔
21

13✔
22
const { getContentType, getCharset } = require('./util')
13✔
23

13✔
24
/**
 * Report whether `el` holds anything, i.e. whether its `length` is not 0.
 */
const has = ({ length }) => length !== 0
13✔
25

13✔
26
/**
 * Push `item` onto `collection` only when `el` is empty (as decided by `has`).
 * Returns `false` when nothing was pushed, otherwise whatever `collection.push` returns.
 */
const upsert = (el, collection, item) =>
  has(el) ? false : collection.push(item)
13✔
27

13✔
28
/**
13✔
29
 * Infer timestamp from `last-modified`, `date`, or `age` response headers.
13✔
30
 */
13✔
31
const getDate = headers => {
13✔
32
  const timestamp = get(headers, 'last-modified') || get(headers, 'date')
75✔
33
  return timestamp
75✔
34
    ? toDate(timestamp)
75✔
35
    : toDate(Date.now() - Number(get(headers, 'age')) * 1000)
75✔
36
}
75✔
37

13✔
38
const addHead = ({ $, url, headers }) => {
13✔
39
  const tags = []
71✔
40
  const charset = getCharset(headers)
71✔
41
  const date = getDate(headers)
71✔
42
  const { domain } = parseUrl(url)
71✔
43
  const head = $('head')
71✔
44

71✔
45
  upsert(head.find('title'), tags, `<title>${path.basename(url)}</title>`)
71✔
46

71✔
47
  if (domain) {
71✔
48
    upsert(
59✔
49
      head.find('meta[property="og:site_name"]'),
59✔
50
      tags,
59✔
51
      `<meta property="og:site_name" content="${domain}">`
59✔
52
    )
59✔
53
  }
59✔
54

71✔
55
  if (date) {
71✔
56
    upsert(
32✔
57
      head.find('meta[property="article:published_time"]'),
32✔
58
      tags,
32✔
59
      `<meta name="date" content="${date}" />`
32✔
60
    )
32✔
61
  }
32✔
62

71✔
63
  upsert(
71✔
64
    head.find('link[rel="canonical"]'),
71✔
65
    tags,
71✔
66
    `<link rel="canonical" href="${url}">`
71✔
67
  )
71✔
68

71✔
69
  if (charset) {
71✔
70
    upsert(head.find('meta[charset]'), tags, `<meta charset="${charset}">`)
26✔
71
  }
26✔
72

71✔
73
  tags.forEach(tag => head.append(tag))
71✔
74
}
71✔
75

13✔
76
const addBody = ({ url, headers, html }) => {
13✔
77
  const contentType = getContentType(headers)
18✔
78
  let element = ''
18✔
79

18✔
80
  if (isMime(contentType, 'image')) {
18✔
81
    element = `<img src="${url}"></img>`
3✔
82
  } else if (isMime(contentType, 'video')) {
18✔
83
    element = `<video><source src="${url}" type="${contentType}"></source></video>`
2✔
84
  } else if (isMime(contentType, 'audio')) {
15✔
85
    element = `<audio><source src="${url}" type="${contentType}"></source></audio>`
2✔
86
  } else if (mimeExtension(contentType) === 'json') {
13✔
87
    element = `<pre>${html}</pre>`
1✔
88
  }
1✔
89

18✔
90
  return `<!DOCTYPE html><html><head></head><body>${element}</body></html>`
18✔
91
}
18✔
92

13✔
93
// True when `prop` starts with one of the prefixes 'og:', 'fb:' or 'al:'.
// `prop` defaults to '' so an undefined attribute value yields false instead of throwing.
const isOpenGraph = (prop = '') =>
13✔
94
  ['og:', 'fb:', 'al:'].some(prefix => prop.startsWith(prefix))
44✔
95

13✔
96
const rewriteMetaTags = ({ $ }) => {
13✔
97
  $('meta').each((_, element) => {
12✔
98
    const el = $(element)
43✔
99
    if (!el.attr('content')) return
43✔
100

26✔
101
    const name = el.attr('name')
26✔
102
    const property = el.attr('property')
26✔
103

26✔
104
    // Convert 'name' to 'property' for Open Graph tags if 'property' is not already set correctly
26✔
105
    if (property !== name && isOpenGraph(name)) {
43✔
106
      el.removeAttr('name').attr('property', name)
8✔
107
      debug('og', el.attr())
8✔
108
      // Convert 'property' to 'name' for non-Open Graph tags
8✔
109
    } else if (property && !isOpenGraph(property)) {
43✔
110
      el.removeAttr('property').attr('name', property)
6✔
111
      debug('meta', el.attr())
6✔
112
    }
6✔
113
  })
12✔
114
}
12✔
115

13✔
116
const rewriteHtmlUrls = ({ $, url }) => {
13✔
117
  forEach(URL_TAGS, (tagName, urlAttr) => {
9✔
118
    $(tagName.join(',')).each(function () {
90✔
119
      const el = $(this)
49✔
120
      const attr = el.attr(urlAttr)
49✔
121
      if (typeof attr !== 'string') return
49✔
122
      try {
30✔
123
        const urlObj = new URL(attr, url)
30✔
124
        if (!urlObj.protocol.startsWith('http')) return
47✔
125
        if (isLocalAddress(urlObj.hostname)) {
47✔
126
          el.remove()
3✔
127
        } else {
47✔
128
          el.attr(urlAttr, urlObj.toString())
16✔
129
        }
16✔
130
      } catch (_) {}
49✔
131
    })
90✔
132
  })
9✔
133
}
9✔
134

13✔
135
const replaceCssUrls = (url, stylesheet) => {
13✔
136
  const cssUrls = Array.from(execall(cssUrl(), stylesheet)).reduce(
3✔
137
    (acc, match) => {
3✔
138
      match.subMatches.forEach(match => acc.add(match))
3✔
139
      return acc
3✔
140
    },
3✔
141
    new Set()
3✔
142
  )
3✔
143

3✔
144
  cssUrls.forEach(cssUrl => {
3✔
145
    if (cssUrl.startsWith('/')) {
3✔
146
      try {
3✔
147
        const absoluteUrl = new URL(cssUrl, url).toString()
3✔
148
        stylesheet = stylesheet.replaceAll(
3✔
149
          `url(${cssUrl})`,
3✔
150
          `url(${absoluteUrl})`
3✔
151
        )
3✔
152
      } catch (_) {}
3!
153
    }
3✔
154
  })
3✔
155

3✔
156
  return stylesheet
3✔
157
}
3✔
158

13✔
159
const rewriteCssUrls = ({ $, url }) => {
13✔
160
  // Process <style> tags
9✔
161
  // e.g., <style>body { background-image: url('/image.jpg'); }</style>
9✔
162
  $('style').each((_, element) =>
9✔
163
    $(element).html(replaceCssUrls(url, $(element).html()))
1✔
164
  )
9✔
165

9✔
166
  // Process elements with style attributes
9✔
167
  // e.g., <div style="background-image: url('/image.jpg');"></div>
9✔
168
  $('[style]').each((_, element) =>
9✔
169
    $(element).attr('style', replaceCssUrls(url, $(element).attr('style')))
2✔
170
  )
9✔
171

9✔
172
  return $
9✔
173
}
9✔
174

13✔
175
const injectStyle = ({ $, styles }) =>
13✔
176
  castArray(styles).forEach(style =>
3✔
177
    $('head').append(
4✔
178
      isUrl(style)
4✔
179
        ? `<link rel="stylesheet" type="text/css" href="${style}">`
4✔
180
        : `<style type="text/css">${style}</style>`
4✔
181
    )
4✔
182
  )
3✔
183

13✔
184
const injectScripts = ({ $, scripts, type }) =>
13✔
185
  castArray(scripts).forEach(script =>
1✔
186
    $('head').append(
2✔
187
      isUrl(script)
2✔
188
        ? `<script src="${script}" type="${type}"></script>`
2✔
189
        : `<script type="${type}">${script}</script>`
2✔
190
    )
2✔
191
  )
1✔
192

13✔
193
// Return `html` unchanged when it already starts with '<!';
// otherwise return it prefixed with '<!DOCTYPE html>'.
const addDocType = html =>
13✔
194
  html.startsWith('<!') ? html : `<!DOCTYPE html>${html}`
71✔
195

13✔
196
module.exports = ({
13✔
197
  html,
71✔
198
  url,
71✔
199
  headers = {},
71✔
200
  styles,
71✔
201
  hide,
71✔
202
  remove,
71✔
203
  rewriteUrls,
71✔
204
  rewriteHtml,
71✔
205
  scripts,
71✔
206
  modules
71✔
207
}) => {
71✔
208
  const content = addDocType(
71✔
209
    isHTML(html) ? html : addBody({ url, headers, html })
71✔
210
  )
71✔
211

71✔
212
  const $ = cheerio.load(content)
71✔
213

71✔
214
  if (rewriteUrls) rewriteHtmlUrls({ $, url })
71✔
215

71✔
216
  if (rewriteHtml) rewriteMetaTags({ $, url })
71✔
217

71✔
218
  addHead({ $, url, headers })
71✔
219

71✔
220
  if (styles) injectStyle({ $, styles })
71✔
221

71✔
222
  if (hide) {
71✔
223
    injectStyle({
1✔
224
      $,
1✔
225
      styles: `${castArray(hide).join(', ')} { visibility: hidden !important; }`
1✔
226
    })
1✔
227
  }
1✔
228

71✔
229
  if (remove) {
71✔
230
    injectStyle({
1✔
231
      $,
1✔
232
      styles: `${castArray(remove).join(', ')} { display: none !important; }`
1✔
233
    })
1✔
234
  }
1✔
235

71✔
236
  if (scripts) injectScripts({ $, scripts, type: 'text/javascript' })
71✔
237
  // injectScripts reads its payload from the `scripts` key, so `modules` must be
  // passed under that name; otherwise `scripts` is undefined and a literal
  // `<script type="module">undefined</script>` is appended instead of the modules.
  if (modules) injectScripts({ $, scripts: modules, type: 'module' })
71!
238

71✔
239
  return rewriteUrls ? rewriteCssUrls({ $, url }) : $
71✔
240
}
71✔
241

13✔
242
module.exports.getDate = getDate
13✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc