• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

thanos / petrify / 27228748356

09 Jun 2026 06:56PM UTC coverage: 85.0%. First build
27228748356

Pull #6

github

web-flow
Merge 52a443ec8 into e9de08346
Pull Request #6: New flow

542 of 638 new or added lines in 5 files covered. (84.95%)

544 of 640 relevant lines covered (85.0%)

2.3 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

83.15
/src/html_parser.rs
1
use crate::types::{Resource, ResourceType};
2
use anyhow::{anyhow, Result};
3
use html5ever::parse_document;
4
use html5ever::tendril::TendrilSink;
5
use markup5ever_rcdom::{Handle, NodeData, RcDom};
6
use regex::Regex;
7
use std::collections::{HashMap, HashSet};
8
use url::Url;
9

10
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
11
struct UrlReference {
12
    original: String,
13
    resolved: Url,
14
}
15

16
pub struct HtmlParser {
17
    base_url: Url,
18
    output_dir: String,
19
}
20

21
impl HtmlParser {
22
    pub fn new(base_url: Url, output_dir: String) -> Self {
3✔
23
        Self {
24
            base_url,
25
            output_dir,
26
        }
27
    }
28

29
    pub fn parse_html(&self, html_content: &str) -> Result<(String, Vec<Resource>)> {
3✔
30
        let dom = parse_document(RcDom::default(), Default::default())
6✔
31
            .from_utf8()
32
            .one(html_content.as_bytes());
6✔
33

34
        let mut resources = Vec::new();
3✔
35
        let mut modified_html = html_content.to_string();
6✔
36

37
        let references = self.extract_url_references(&dom.document)?;
6✔
38
        let mut local_paths = HashMap::new();
3✔
39

40
        for reference in &references {
6✔
41
            if local_paths.contains_key(reference.resolved.as_str()) {
6✔
42
                continue;
43
            }
44
            match self.create_resource(&reference.resolved) {
3✔
45
                Ok(resource) => {
3✔
46
                    local_paths.insert(reference.resolved.to_string(), resource.local_path.clone());
6✔
47
                    resources.push(resource);
3✔
48
                }
NEW
49
                Err(e) => {
×
NEW
50
                    log::warn!(
×
51
                        "Failed to create resource for {}: {}",
52
                        reference.resolved.as_str(),
53
                        e
54
                    );
55
                }
56
            }
57
        }
58

59
        for reference in &references {
3✔
60
            if let Some(local_path) = local_paths.get(reference.resolved.as_str()) {
9✔
61
                modified_html =
6✔
62
                    self.replace_url_in_html(&modified_html, &reference.original, local_path);
3✔
63
            }
64
        }
65

66
        Ok((modified_html, resources))
3✔
67
    }
68

69
    fn extract_url_references(&self, handle: &Handle) -> Result<Vec<UrlReference>> {
3✔
70
        let mut references = Vec::new();
3✔
71
        let mut seen = HashSet::new();
3✔
72
        self.walk_dom(handle, &mut references, &mut seen)?;
6✔
73
        Ok(references)
3✔
74
    }
75

76
    fn walk_dom(
3✔
77
        &self,
78
        handle: &Handle,
79
        references: &mut Vec<UrlReference>,
80
        seen: &mut HashSet<(String, String)>,
81
    ) -> Result<()> {
82
        let node = handle;
3✔
83

84
        match &node.data {
3✔
85
            NodeData::Element { ref attrs, .. } => {
3✔
86
                for attr in attrs.borrow().iter() {
6✔
87
                    if self.is_url_attribute(&attr.name.local) {
4✔
88
                        let url_str = attr.value.to_string();
2✔
89
                        if let Ok(url) = self.resolve_url(&url_str) {
8✔
90
                            let key = (url_str.clone(), url.to_string());
4✔
91
                            if seen.insert(key) {
2✔
92
                                references.push(UrlReference {
2✔
93
                                    original: url_str,
2✔
94
                                    resolved: url,
2✔
95
                                });
96
                            }
97
                        }
98
                    }
99
                }
100
            }
101
            NodeData::Text { ref contents } => {
3✔
102
                if let Some(urls_in_text) = self.extract_urls_from_text(&contents.borrow()) {
6✔
103
                    for url_str in urls_in_text {
4✔
104
                        if let Ok(url) = self.resolve_url(&url_str) {
4✔
105
                            let key = (url_str.clone(), url.to_string());
2✔
106
                            if seen.insert(key) {
1✔
107
                                references.push(UrlReference {
1✔
108
                                    original: url_str,
1✔
109
                                    resolved: url,
1✔
110
                                });
111
                            }
112
                        }
113
                    }
114
                }
115
            }
116
            _ => {}
117
        }
118

119
        for child in node.children.borrow().iter() {
6✔
120
            self.walk_dom(child, references, seen)?;
6✔
121
        }
122

123
        Ok(())
3✔
124
    }
125

126
    fn is_url_attribute(&self, attr_name: &str) -> bool {
2✔
127
        matches!(
4✔
128
            attr_name,
129
            "src"
130
                | "href"
131
                | "data-src"
132
                | "data-original"
133
                | "poster"
134
                | "background"
135
                | "data-srcset"
136
                | "data-lazy-src"
137
        )
138
    }
139

140
    fn extract_urls_from_text(&self, text: &str) -> Option<Vec<String>> {
3✔
141
        let mut urls = Vec::new();
3✔
142

143
        // Extract URLs from CSS @import statements
144
        let import_regex = Regex::new(r#"@import\s+["']([^"']+)["']"#).ok()?;
6✔
145
        for cap in import_regex.captures_iter(text) {
9✔
146
            if let Some(url) = cap.get(1) {
2✔
147
                urls.push(url.as_str().to_string());
2✔
148
            }
149
        }
150

151
        // Extract URLs from CSS url() functions
152
        let url_regex = Regex::new(r#"url\(["']?([^"')]+)["']?\)"#).ok()?;
3✔
153
        for cap in url_regex.captures_iter(text) {
9✔
154
            if let Some(url) = cap.get(1) {
2✔
155
                urls.push(url.as_str().to_string());
2✔
156
            }
157
        }
158

159
        if urls.is_empty() {
4✔
160
            None
2✔
161
        } else {
162
            Some(urls)
1✔
163
        }
164
    }
165

166
    fn resolve_url(&self, url_str: &str) -> Result<Url> {
3✔
167
        if url_str.starts_with("data:") || url_str.starts_with("#") {
3✔
168
            return Err(anyhow!("Skipping data URL or fragment"));
1✔
169
        }
170

171
        if url_str.starts_with("//") {
3✔
172
            let scheme = self.base_url.scheme();
1✔
173
            Ok(Url::parse(&format!("{scheme}:{url_str}"))?)
2✔
174
        } else if url_str.starts_with('/') {
6✔
175
            // Absolute path
176
            let mut url = self.base_url.clone();
2✔
177
            url.set_path(url_str);
2✔
178
            Ok(url)
2✔
179
        } else if url_str.starts_with("http://") || url_str.starts_with("https://") {
7✔
180
            // Absolute URL
181
            Ok(Url::parse(url_str)?)
1✔
182
        } else {
183
            // Relative URL
184
            Ok(self.base_url.join(url_str)?)
3✔
185
        }
186
    }
187

188
    fn create_resource(&self, url: &Url) -> Result<Resource> {
3✔
189
        let resource_type = self.determine_resource_type(url);
3✔
190
        let local_path = self.generate_local_path(url, &resource_type)?;
6✔
191

192
        Ok(Resource {
3✔
193
            url: url.clone(),
3✔
194
            local_path,
3✔
195
            resource_type: resource_type.clone(),
3✔
196
            mime_type: self.guess_mime_type(url, &resource_type),
3✔
197
            size: None,
198
            downloaded: false,
199
        })
200
    }
201

202
    fn determine_resource_type(&self, url: &Url) -> ResourceType {
3✔
203
        let path = url.path();
3✔
204
        let extension = path.split('.').next_back().unwrap_or("").to_lowercase();
3✔
205

206
        // Check for explicit file extensions first
207
        match extension.as_str() {
6✔
208
            "html" | "htm" => ResourceType::HTML,
3✔
209
            "css" => ResourceType::CSS,
6✔
210
            "js" | "javascript" => ResourceType::JavaScript,
6✔
211
            "jpg" | "jpeg" | "png" | "gif" | "webp" | "svg" | "ico" | "bmp" | "tiff" | "tif" => {
6✔
212
                ResourceType::Image
3✔
213
            }
214
            "mp4" | "webm" | "ogg" | "avi" | "mov" | "m4v" => ResourceType::Video,
2✔
215
            "pdf" => ResourceType::PDF,
2✔
216
            "woff" | "woff2" | "ttf" | "otf" | "eot" => ResourceType::Font,
4✔
217
            _ => {
218
                // For URLs without extensions, check if they look like HTML pages
219
                if self.looks_like_html_page(path) {
2✔
220
                    ResourceType::HTML
2✔
221
                } else {
NEW
222
                    ResourceType::Other
×
223
                }
224
            }
225
        }
226
    }
227

228
    fn looks_like_html_page(&self, path: &str) -> bool {
2✔
229
        // Skip empty paths and root
230
        if path.is_empty() || path == "/" {
2✔
NEW
231
            return false;
×
232
        }
233

234
        // Check if the path ends with a slash (directory-like)
235
        if path.ends_with('/') {
2✔
NEW
236
            return true;
×
237
        }
238

239
        // Check if the path looks like a content page (not a file)
240
        let segments: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect();
6✔
241

242
        // If it's a single segment without extension, likely a page
243
        if segments.len() == 1 && !segments[0].contains('.') {
6✔
244
            return true;
2✔
245
        }
246

247
        // If it's multiple segments and the last one doesn't have an extension, likely a page
NEW
248
        if segments.len() > 1 {
×
NEW
249
            let last_segment = segments.last().unwrap_or(&"");
×
NEW
250
            if !last_segment.contains('.') && !last_segment.is_empty() {
×
NEW
251
                return true;
×
252
            }
253
        }
254

NEW
255
        false
×
256
    }
257

258
    fn generate_local_path(&self, url: &Url, resource_type: &ResourceType) -> Result<String> {
3✔
259
        let path = url.path();
3✔
260

261
        let subdirectory = match resource_type {
3✔
262
            ResourceType::HTML => "", // HTML files maintain original structure
2✔
263
            ResourceType::CSS => "static/css",
3✔
264
            ResourceType::JavaScript => "static/js",
2✔
265
            ResourceType::Image => "static/images",
3✔
NEW
266
            ResourceType::Video => "static/video",
×
NEW
267
            ResourceType::PDF => "static/pdf",
×
NEW
268
            ResourceType::Font => "static/fonts",
×
NEW
269
            ResourceType::Other => "static/other",
×
270
        };
271

272
        // For HTML files, preserve the original directory structure
273
        if *resource_type == ResourceType::HTML {
3✔
274
            let path_segments: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect();
6✔
275

276
            if path_segments.is_empty() || path == "/" {
6✔
277
                // Root page
NEW
278
                return Ok(format!("{}/index.html", self.output_dir));
×
279
            } else {
280
                // Create directory structure matching the original URL
281
                let dir_path = path_segments[..path_segments.len() - 1].join("/");
2✔
282
                let filename = path_segments.last().unwrap_or(&"index");
4✔
283

284
                // Handle trailing slash (directory-like URLs)
285
                if path.ends_with('/') {
2✔
NEW
286
                    if dir_path.is_empty() {
×
NEW
287
                        return Ok(format!("{}/{}/index.html", self.output_dir, filename));
×
288
                    } else {
NEW
289
                        return Ok(format!(
×
290
                            "{}/{}/{}/index.html",
291
                            self.output_dir, dir_path, filename
292
                        ));
293
                    }
294
                } else {
295
                    // Regular file path
296
                    if dir_path.is_empty() {
4✔
297
                        return Ok(format!("{}/{}.html", self.output_dir, filename));
4✔
298
                    } else {
NEW
299
                        return Ok(format!(
×
300
                            "{}/{}/{}.html",
301
                            self.output_dir, dir_path, filename
302
                        ));
303
                    }
304
                }
305
            }
306
        }
307

308
        // For non-HTML resources, use the existing logic
309
        let filename = path.split('/').next_back().unwrap_or("unknown");
3✔
310
        let mut unique_filename = filename.to_string();
3✔
311
        if filename == "index.html" || filename.is_empty() {
9✔
NEW
312
            let host = url.host_str().unwrap_or("unknown");
×
NEW
313
            let path_segments: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect();
×
NEW
314
            if path_segments.is_empty() {
×
NEW
315
                unique_filename = format!("{}.html", host);
×
316
            } else {
NEW
317
                unique_filename = format!("{}.html", path_segments.join("_"));
×
318
            }
319
        }
320

321
        Ok(format!(
6✔
322
            "{}/{}/{}",
323
            self.output_dir, subdirectory, unique_filename
324
        ))
325
    }
326

327
    fn guess_mime_type(&self, url: &Url, resource_type: &ResourceType) -> String {
3✔
328
        let path = url.path();
3✔
329
        let extension = path.split('.').next_back().unwrap_or("").to_lowercase();
3✔
330

331
        match extension.as_str() {
6✔
332
            "html" | "htm" => "text/html".to_string(),
5✔
333
            "css" => "text/css".to_string(),
9✔
334
            "js" => "application/javascript".to_string(),
10✔
335
            "jpg" | "jpeg" => "image/jpeg".to_string(),
7✔
336
            "png" => "image/png".to_string(),
9✔
337
            "gif" => "image/gif".to_string(),
4✔
338
            "webp" => "image/webp".to_string(),
4✔
339
            "svg" => "image/svg+xml".to_string(),
4✔
340
            "ico" => "image/x-icon".to_string(),
4✔
341
            "mp4" => "video/mp4".to_string(),
4✔
342
            "webm" => "video/webm".to_string(),
4✔
343
            "ogg" => "video/ogg".to_string(),
4✔
344
            "pdf" => "application/pdf".to_string(),
4✔
345
            "woff" => "font/woff".to_string(),
4✔
346
            "woff2" => "font/woff2".to_string(),
4✔
347
            "ttf" => "font/ttf".to_string(),
4✔
348
            _ => match resource_type {
2✔
349
                ResourceType::HTML => "text/html".to_string(),
2✔
NEW
350
                ResourceType::CSS => "text/css".to_string(),
×
351
                ResourceType::JavaScript => "application/javascript".to_string(),
2✔
NEW
352
                ResourceType::Image => "image/jpeg".to_string(),
×
NEW
353
                ResourceType::Video => "video/mp4".to_string(),
×
NEW
354
                ResourceType::PDF => "application/pdf".to_string(),
×
NEW
355
                ResourceType::Font => "font/woff".to_string(),
×
NEW
356
                ResourceType::Other => "application/octet-stream".to_string(),
×
357
            },
358
        }
359
    }
360

361
    fn replace_url_in_html(&self, html: &str, original_url: &str, local_path: &str) -> String {
3✔
362
        let relative_path = self.make_relative_path(local_path);
3✔
363
        html.replace(original_url, &relative_path)
6✔
364
    }
365

366
    fn make_relative_path(&self, local_path: &str) -> String {
3✔
367
        // Convert absolute path to relative path from the HTML file location
368
        if let Some(relative) = local_path.strip_prefix(&self.output_dir) {
3✔
369
            relative.strip_prefix('/').unwrap_or(relative).to_string()
3✔
370
        } else {
NEW
371
            local_path.to_string()
×
372
        }
373
    }
374
}
375

376
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a parser for `base` that writes below `output_dir`.
    fn parser_for(base: &str, output_dir: &str) -> HtmlParser {
        HtmlParser::new(Url::parse(base).unwrap(), output_dir.to_string())
    }

    #[test]
    fn test_url_resolution() {
        let parser = parser_for("https://example.com/page/", "./output");

        let resolved = parser.resolve_url("image.jpg").unwrap();
        assert_eq!(resolved.as_str(), "https://example.com/page/image.jpg");
    }

    #[test]
    fn test_resource_type_detection() {
        let parser = parser_for("https://example.com/", "./output");

        let image_url = Url::parse("https://example.com/image.png").unwrap();
        let resource_type = parser.determine_resource_type(&image_url);
        assert!(matches!(resource_type, ResourceType::Image));
    }

    #[test]
    fn test_protocol_relative_url_resolution() {
        let parser = parser_for("http://example.com/page/", "./output");

        let resolved = parser.resolve_url("//cdn.example.com/lib.js").unwrap();
        assert_eq!(resolved.as_str(), "http://cdn.example.com/lib.js");
    }

    #[test]
    fn test_extensionless_path_is_html() {
        let parser = parser_for("https://example.com/", "./output");

        let about = Url::parse("https://example.com/about").unwrap();
        assert_eq!(parser.determine_resource_type(&about), ResourceType::HTML);
    }

    #[test]
    fn test_absolute_path_resolution() {
        let parser = parser_for("https://example.com/page/", "./output");

        let resolved = parser.resolve_url("/assets/app.css").unwrap();
        assert_eq!(resolved.as_str(), "https://example.com/assets/app.css");
    }

    #[test]
    fn test_absolute_https_url_resolution() {
        let parser = parser_for("https://example.com/", "./output");

        let resolved = parser
            .resolve_url("https://cdn.example.com/lib.js")
            .unwrap();
        assert_eq!(resolved.as_str(), "https://cdn.example.com/lib.js");
    }

    #[test]
    fn test_skips_data_and_fragment_urls() {
        let parser = parser_for("https://example.com/", "./output");

        assert!(parser.resolve_url("#section").is_err());
        assert!(parser.resolve_url("data:image/png;base64,abc").is_err());
    }

    #[test]
    fn extracts_urls_from_inline_css() {
        let html = r#"<html><head><style>
            @import "imported.css";
            body { background: url(bg.png); }
        </style></head></html>"#;
        let parser = parser_for("https://example.com/", "./output");

        let (_, resources) = parser.parse_html(html).unwrap();
        assert!(resources
            .iter()
            .any(|r| r.url.path().ends_with("imported.css")));
        assert!(resources.iter().any(|r| r.url.path().ends_with("bg.png")));
    }

    #[test]
    fn guess_mime_type_falls_back_to_resource_type() {
        let parser = parser_for("https://example.com/", "./output");

        let url = Url::parse("https://example.com/no-extension").unwrap();
        assert_eq!(
            parser.guess_mime_type(&url, &ResourceType::JavaScript),
            "application/javascript"
        );
    }

    #[test]
    fn make_relative_path_strips_output_prefix() {
        let parser = parser_for("https://example.com/", "/output");

        let relative = parser.make_relative_path("/output/static/css/app.css");
        assert_eq!(relative, "static/css/app.css");
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc